user_by_id / user_by_login / tweet_details return none in case of resource not accessible api; add dump when parsing failed

Этот коммит содержится в:
Vlad Pronsky 2023-07-30 15:37:43 +03:00
родитель a378a9721e
Коммит fbbe4b61a9
2 изменённых файлов: 70 добавлений и 28 удалений

Просмотреть файл

@ -4,9 +4,9 @@ from httpx import Response
from .accounts_pool import AccountsPool
from .constants import * # noqa: F403
from .logger import set_log_level
from .models import Tweet, User, get_tweets, get_users
from .models import parse_tweet, parse_tweets, parse_user, parse_users
from .queue_client import QueueClient
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep
from .utils import encode_params, find_obj, get_by_path
SEARCH_FEATURES = {
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
@ -95,7 +95,7 @@ class API:
async def search(self, q: str, limit=-1, kv=None):
async for rep in self.search_raw(q, limit=limit, kv=kv):
for x in get_tweets(rep.json(), limit):
for x in parse_tweets(rep.json(), limit):
yield x
# user_by_id
@ -112,8 +112,7 @@ class API:
async def user_by_id(self, uid: int, kv=None):
rep = await self.user_by_id_raw(uid, kv=kv)
res = rep.json()
return User.parse(to_old_obj(res["data"]["user"]["result"]))
return parse_user(rep)
# user_by_login
@ -130,8 +129,7 @@ class API:
async def user_by_login(self, login: str, kv=None):
rep = await self.user_by_login_raw(login, kv=kv)
res = rep.json()
return User.parse(to_old_obj(res["data"]["user"]["result"]))
return parse_user(rep)
# tweet_details
@ -163,9 +161,7 @@ class API:
async def tweet_details(self, twid: int, kv=None):
rep = await self.tweet_details_raw(twid, kv=kv)
obj = to_old_rep(rep.json())
doc = obj["tweets"].get(str(twid), None)
return Tweet.parse(doc, obj) if doc else None
return parse_tweet(rep, twid)
# followers
@ -177,7 +173,7 @@ class API:
async def followers(self, uid: int, limit=-1, kv=None):
async for rep in self.followers_raw(uid, limit=limit, kv=kv):
for x in get_users(rep.json(), limit):
for x in parse_users(rep.json(), limit):
yield x
# following
@ -190,7 +186,7 @@ class API:
async def following(self, uid: int, limit=-1, kv=None):
async for rep in self.following_raw(uid, limit=limit, kv=kv):
for x in get_users(rep.json(), limit):
for x in parse_users(rep.json(), limit):
yield x
# retweeters
@ -203,7 +199,7 @@ class API:
async def retweeters(self, twid: int, limit=-1, kv=None):
async for rep in self.retweeters_raw(twid, limit=limit, kv=kv):
for x in get_users(rep.json(), limit):
for x in parse_users(rep.json(), limit):
yield x
# favoriters
@ -216,7 +212,7 @@ class API:
async def favoriters(self, twid: int, limit=-1, kv=None):
async for rep in self.favoriters_raw(twid, limit=limit, kv=kv):
for x in get_users(rep.json(), limit):
for x in parse_users(rep.json(), limit):
yield x
# user_tweets
@ -237,7 +233,7 @@ class API:
async def user_tweets(self, uid: int, limit=-1, kv=None):
async for rep in self.user_tweets_raw(uid, limit=limit, kv=kv):
for x in get_tweets(rep.json(), limit):
for x in parse_tweets(rep.json(), limit):
yield x
# user_tweets_and_replies
@ -258,7 +254,7 @@ class API:
async def user_tweets_and_replies(self, uid: int, limit=-1, kv=None):
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit, kv=kv):
for x in get_tweets(rep.json(), limit):
for x in parse_tweets(rep.json(), limit):
yield x
# list timeline
@ -275,5 +271,5 @@ class API:
async def list_timeline(self, list_id: int, limit=-1, kv=None):
async for rep in self.list_timeline_raw(list_id, limit=limit, kv=kv):
for x in get_tweets(rep, limit):
for x in parse_tweets(rep, limit):
yield x

Просмотреть файл

@ -1,6 +1,10 @@
import email.utils
import json
import os
import random
import re
import string
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Generator, Optional
@ -8,7 +12,7 @@ from typing import Generator, Optional
import httpx
from .logger import logger
from .utils import find_item, get_or, int_or_none, to_old_rep
from .utils import find_item, get_or, int_or_none, to_old_obj, to_old_rep
@dataclass
@ -323,6 +327,9 @@ class Media(JSONTrait):
return Media(photos=photos, videos=videos, animated=animated)
# internal helpers
def _get_reply_user(tw_obj: dict, res: dict):
user_id = tw_obj.get("in_reply_to_user_id_str", None)
if user_id is None:
@ -381,10 +388,25 @@ def _get_views(obj: dict, rt_obj: dict):
return None
# reply parsing
def _write_dump(kind: str, e: Exception, x: dict, obj: dict):
uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
time = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
dumpfile = f"/tmp/twscrape/twscrape_parse_error_{time}_{uniq}.txt"
os.makedirs(os.path.dirname(dumpfile), exist_ok=True)
with open(dumpfile, "w") as fp:
msg = [
f"Error parsing {kind}. Error: {type(e)}",
traceback.format_exc(),
json.dumps(x, default=str),
json.dumps(obj, default=str),
]
fp.write("\n\n".join(msg))
logger.error(f"Failed to parse response of {kind}, writing dump to {dumpfile}")
def get_items(rep: httpx.Response, kind: str, limit: int = -1):
def _parse_items(rep: httpx.Response, kind: str, limit: int = -1):
if kind == "user":
Cls = User
key = "users"
@ -400,15 +422,39 @@ def get_items(rep: httpx.Response, kind: str, limit: int = -1):
if limit != -1 and len(ids) >= limit:
break
tmp = Cls.parse(x, obj)
if tmp.id not in ids:
ids.add(tmp.id)
yield tmp
try:
tmp = Cls.parse(x, obj)
if tmp.id not in ids:
ids.add(tmp.id)
yield tmp
except Exception as e:
_write_dump(kind, e, x, obj)
continue
def get_tweets(rep: httpx.Response, limit: int = -1) -> Generator[Tweet, None, None]:
return get_items(rep, "tweet", limit) # type: ignore
# public helpers
def get_users(rep: httpx.Response, limit: int = -1) -> Generator[User, None, None]:
return get_items(rep, "user", limit) # type: ignore
def parse_tweets(rep: httpx.Response, limit: int = -1) -> Generator[Tweet, None, None]:
return _parse_items(rep, "tweet", limit) # type: ignore
def parse_users(rep: httpx.Response, limit: int = -1) -> Generator[User, None, None]:
return _parse_items(rep, "user", limit) # type: ignore
def parse_user(rep: httpx.Response) -> User | None:
try:
res = rep.json()
return User.parse(to_old_obj(res["data"]["user"]["result"]))
except Exception:
return None
def parse_tweet(rep: httpx.Response, twid: int) -> Tweet | None:
try:
obj = to_old_rep(rep.json())
doc = obj["tweets"].get(str(twid), None)
return Tweet.parse(doc, obj) if doc else None
except Exception:
return None