diff --git a/twscrape/api.py b/twscrape/api.py index fe79eed..5d8c97e 100644 --- a/twscrape/api.py +++ b/twscrape/api.py @@ -4,9 +4,9 @@ from httpx import Response from .accounts_pool import AccountsPool from .constants import * # noqa: F403 from .logger import set_log_level -from .models import Tweet, User, get_tweets, get_users +from .models import parse_tweet, parse_tweets, parse_user, parse_users from .queue_client import QueueClient -from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep +from .utils import encode_params, find_obj, get_by_path SEARCH_FEATURES = { "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True, @@ -95,7 +95,7 @@ class API: async def search(self, q: str, limit=-1, kv=None): async for rep in self.search_raw(q, limit=limit, kv=kv): - for x in get_tweets(rep.json(), limit): + for x in parse_tweets(rep.json(), limit): yield x # user_by_id @@ -112,8 +112,7 @@ class API: async def user_by_id(self, uid: int, kv=None): rep = await self.user_by_id_raw(uid, kv=kv) - res = rep.json() - return User.parse(to_old_obj(res["data"]["user"]["result"])) + return parse_user(rep) # user_by_login @@ -130,8 +129,7 @@ class API: async def user_by_login(self, login: str, kv=None): rep = await self.user_by_login_raw(login, kv=kv) - res = rep.json() - return User.parse(to_old_obj(res["data"]["user"]["result"])) + return parse_user(rep) # tweet_details @@ -163,9 +161,7 @@ class API: async def tweet_details(self, twid: int, kv=None): rep = await self.tweet_details_raw(twid, kv=kv) - obj = to_old_rep(rep.json()) - doc = obj["tweets"].get(str(twid), None) - return Tweet.parse(doc, obj) if doc else None + return parse_tweet(rep, twid) # followers @@ -177,7 +173,7 @@ class API: async def followers(self, uid: int, limit=-1, kv=None): async for rep in self.followers_raw(uid, limit=limit, kv=kv): - for x in get_users(rep.json(), limit): + for x in parse_users(rep.json(), limit): yield x # following @@ -190,7 +186,7 @@ class API: async def following(self, uid: int, limit=-1, kv=None): async for rep in self.following_raw(uid, limit=limit, kv=kv): - for x in get_users(rep.json(), limit): + for x in parse_users(rep.json(), limit): yield x # retweeters @@ -203,7 +199,7 @@ class API: async def retweeters(self, twid: int, limit=-1, kv=None): async for rep in self.retweeters_raw(twid, limit=limit, kv=kv): - for x in get_users(rep.json(), limit): + for x in parse_users(rep.json(), limit): yield x # favoriters @@ -216,7 +212,7 @@ class API: async def favoriters(self, twid: int, limit=-1, kv=None): async for rep in self.favoriters_raw(twid, limit=limit, kv=kv): - for x in get_users(rep.json(), limit): + for x in parse_users(rep.json(), limit): yield x # user_tweets @@ -237,7 +233,7 @@ class API: async def user_tweets(self, uid: int, limit=-1, kv=None): async for rep in self.user_tweets_raw(uid, limit=limit, kv=kv): - for x in get_tweets(rep.json(), limit): + for x in parse_tweets(rep.json(), limit): yield x # user_tweets_and_replies @@ -258,7 +254,7 @@ class API: async def user_tweets_and_replies(self, uid: int, limit=-1, kv=None): async for rep in self.user_tweets_and_replies_raw(uid, limit=limit, kv=kv): - for x in get_tweets(rep.json(), limit): + for x in parse_tweets(rep.json(), limit): yield x # list timeline @@ -275,5 +271,5 @@ class API: async def list_timeline(self, list_id: int, limit=-1, kv=None): async for rep in self.list_timeline_raw(list_id, limit=limit, kv=kv): - for x in get_tweets(rep, limit): + for x in parse_tweets(rep, limit): yield x diff --git a/twscrape/models.py b/twscrape/models.py index f506851..96ae995 100644 --- a/twscrape/models.py +++ b/twscrape/models.py @@ -1,6 +1,10 @@ import email.utils import json +import os +import random import re +import string +import traceback from dataclasses import asdict, dataclass, field from datetime import datetime from typing import Generator, Optional @@ -8,7 +12,7 @@ from typing import Generator, Optional import httpx from .logger import logger -from .utils import find_item, get_or, int_or_none, to_old_rep +from .utils import find_item, get_or, int_or_none, to_old_obj, to_old_rep @dataclass @@ -323,6 +327,9 @@ class Media(JSONTrait): return Media(photos=photos, videos=videos, animated=animated) +# internal helpers + + def _get_reply_user(tw_obj: dict, res: dict): user_id = tw_obj.get("in_reply_to_user_id_str", None) if user_id is None: @@ -381,10 +388,25 @@ def _get_views(obj: dict, rt_obj: dict): return None -# reply parsing +def _write_dump(kind: str, e: Exception, x: dict, obj: dict): + uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) + time = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S") + dumpfile = f"/tmp/twscrape/twscrape_parse_error_{time}_{uniq}.txt" + os.makedirs(os.path.dirname(dumpfile), exist_ok=True) + + with open(dumpfile, "w") as fp: + msg = [ + f"Error parsing {kind}. Error: {type(e)}", + traceback.format_exc(), + json.dumps(x, default=str), + json.dumps(obj, default=str), + ] + fp.write("\n\n".join(msg)) + + logger.error(f"Failed to parse response of {kind}, writing dump to {dumpfile}") -def get_items(rep: httpx.Response, kind: str, limit: int = -1): +def _parse_items(rep: httpx.Response, kind: str, limit: int = -1): if kind == "user": Cls = User key = "users" @@ -400,15 +422,39 @@ def get_items(rep: httpx.Response, kind: str, limit: int = -1): if limit != -1 and len(ids) >= limit: break - tmp = Cls.parse(x, obj) - if tmp.id not in ids: - ids.add(tmp.id) - yield tmp + try: + tmp = Cls.parse(x, obj) + if tmp.id not in ids: + ids.add(tmp.id) + yield tmp + except Exception as e: + _write_dump(kind, e, x, obj) + continue -def get_tweets(rep: httpx.Response, limit: int = -1) -> Generator[Tweet, None, None]: - return get_items(rep, "tweet", limit) # type: ignore +# public helpers -def get_users(rep: httpx.Response, limit: int = -1) -> Generator[User, None, None]: - return get_items(rep, "user", limit) # type: ignore +def parse_tweets(rep: httpx.Response, limit: int = -1) -> Generator[Tweet, None, None]: + return _parse_items(rep, "tweet", limit) # type: ignore + + +def parse_users(rep: httpx.Response, limit: int = -1) -> Generator[User, None, None]: + return _parse_items(rep, "user", limit) # type: ignore + + +def parse_user(rep: httpx.Response) -> User | None: + try: + res = rep.json() + return User.parse(to_old_obj(res["data"]["user"]["result"])) + except Exception: + return None + + +def parse_tweet(rep: httpx.Response, twid: int) -> Tweet | None: + try: + obj = to_old_rep(rep.json()) + doc = obj["tweets"].get(str(twid), None) + return Tweet.parse(doc, obj) if doc else None + except Exception: + return None