зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-29 13:06:13 +02:00
user_by_id / user_by_login / tweet_details return None when the API resource is not accessible; add a dump when parsing fails
Этот коммит содержится в:
родитель
a378a9721e
Коммит
fbbe4b61a9
@ -4,9 +4,9 @@ from httpx import Response
|
||||
from .accounts_pool import AccountsPool
|
||||
from .constants import * # noqa: F403
|
||||
from .logger import set_log_level
|
||||
from .models import Tweet, User, get_tweets, get_users
|
||||
from .models import parse_tweet, parse_tweets, parse_user, parse_users
|
||||
from .queue_client import QueueClient
|
||||
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep
|
||||
from .utils import encode_params, find_obj, get_by_path
|
||||
|
||||
SEARCH_FEATURES = {
|
||||
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
|
||||
@ -95,7 +95,7 @@ class API:
|
||||
|
||||
async def search(self, q: str, limit=-1, kv=None):
    """Search tweets matching query *q*, yielding parsed Tweet objects.

    Stops after *limit* tweets (-1 = no limit). *kv* is forwarded to the
    raw request as extra query parameters.
    """
    async for rep in self.search_raw(q, limit=limit, kv=kv):
        for x in parse_tweets(rep.json(), limit):
            yield x
|
||||
|
||||
# user_by_id
|
||||
@ -112,8 +112,7 @@ class API:
|
||||
|
||||
async def user_by_id(self, uid: int, kv=None):
    """Fetch a user by numeric id.

    Returns the parsed User, or None when the response cannot be parsed
    (e.g. the API answered that the resource is not accessible).
    """
    rep = await self.user_by_id_raw(uid, kv=kv)
    return parse_user(rep)
|
||||
|
||||
# user_by_login
|
||||
|
||||
@ -130,8 +129,7 @@ class API:
|
||||
|
||||
async def user_by_login(self, login: str, kv=None):
    """Fetch a user by screen name (login).

    Returns the parsed User, or None when the response cannot be parsed
    (e.g. the API answered that the resource is not accessible).
    """
    rep = await self.user_by_login_raw(login, kv=kv)
    return parse_user(rep)
|
||||
|
||||
# tweet_details
|
||||
|
||||
@ -163,9 +161,7 @@ class API:
|
||||
|
||||
async def tweet_details(self, twid: int, kv=None):
    """Fetch a single tweet by id.

    Returns the parsed Tweet, or None when the tweet is absent from the
    response or parsing fails.
    """
    rep = await self.tweet_details_raw(twid, kv=kv)
    return parse_tweet(rep, twid)
|
||||
|
||||
# followers
|
||||
|
||||
@ -177,7 +173,7 @@ class API:
|
||||
|
||||
async def followers(self, uid: int, limit=-1, kv=None):
    """Yield parsed User objects following user *uid* (at most *limit*; -1 = no limit)."""
    async for rep in self.followers_raw(uid, limit=limit, kv=kv):
        for x in parse_users(rep.json(), limit):
            yield x
|
||||
|
||||
# following
|
||||
@ -190,7 +186,7 @@ class API:
|
||||
|
||||
async def following(self, uid: int, limit=-1, kv=None):
    """Yield parsed User objects that user *uid* follows (at most *limit*; -1 = no limit)."""
    async for rep in self.following_raw(uid, limit=limit, kv=kv):
        for x in parse_users(rep.json(), limit):
            yield x
|
||||
|
||||
# retweeters
|
||||
@ -203,7 +199,7 @@ class API:
|
||||
|
||||
async def retweeters(self, twid: int, limit=-1, kv=None):
    """Yield parsed User objects that retweeted tweet *twid* (at most *limit*; -1 = no limit)."""
    async for rep in self.retweeters_raw(twid, limit=limit, kv=kv):
        for x in parse_users(rep.json(), limit):
            yield x
|
||||
|
||||
# favoriters
|
||||
@ -216,7 +212,7 @@ class API:
|
||||
|
||||
async def favoriters(self, twid: int, limit=-1, kv=None):
    """Yield parsed User objects that liked tweet *twid* (at most *limit*; -1 = no limit)."""
    async for rep in self.favoriters_raw(twid, limit=limit, kv=kv):
        for x in parse_users(rep.json(), limit):
            yield x
|
||||
|
||||
# user_tweets
|
||||
@ -237,7 +233,7 @@ class API:
|
||||
|
||||
async def user_tweets(self, uid: int, limit=-1, kv=None):
    """Yield parsed Tweet objects from user *uid*'s timeline (at most *limit*; -1 = no limit)."""
    async for rep in self.user_tweets_raw(uid, limit=limit, kv=kv):
        for x in parse_tweets(rep.json(), limit):
            yield x
|
||||
|
||||
# user_tweets_and_replies
|
||||
@ -258,7 +254,7 @@ class API:
|
||||
|
||||
async def user_tweets_and_replies(self, uid: int, limit=-1, kv=None):
    """Yield parsed Tweet objects from user *uid*'s tweets-and-replies timeline (at most *limit*; -1 = no limit)."""
    async for rep in self.user_tweets_and_replies_raw(uid, limit=limit, kv=kv):
        for x in parse_tweets(rep.json(), limit):
            yield x
|
||||
|
||||
# list timeline
|
||||
@ -275,5 +271,5 @@ class API:
|
||||
|
||||
async def list_timeline(self, list_id: int, limit=-1, kv=None):
    """Yield parsed Tweet objects from list *list_id*'s timeline (at most *limit*; -1 = no limit)."""
    async for rep in self.list_timeline_raw(list_id, limit=limit, kv=kv):
        # NOTE(review): sibling methods pass rep.json() here, but both the old and
        # new versions of this method pass `rep` as-is — confirm what
        # list_timeline_raw actually yields before "fixing" this.
        for x in parse_tweets(rep, limit):
            yield x
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
import email.utils
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
import traceback
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Generator, Optional
|
||||
@ -8,7 +12,7 @@ from typing import Generator, Optional
|
||||
import httpx
|
||||
|
||||
from .logger import logger
|
||||
from .utils import find_item, get_or, int_or_none, to_old_rep
|
||||
from .utils import find_item, get_or, int_or_none, to_old_obj, to_old_rep
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -323,6 +327,9 @@ class Media(JSONTrait):
|
||||
return Media(photos=photos, videos=videos, animated=animated)
|
||||
|
||||
|
||||
# internal helpers
|
||||
|
||||
|
||||
def _get_reply_user(tw_obj: dict, res: dict):
|
||||
user_id = tw_obj.get("in_reply_to_user_id_str", None)
|
||||
if user_id is None:
|
||||
@ -381,10 +388,25 @@ def _get_views(obj: dict, rt_obj: dict):
|
||||
return None
|
||||
|
||||
|
||||
# reply parsing
|
||||
def _write_dump(kind: str, e: Exception, x: dict, obj: dict):
    """Persist a parse-failure dump to disk and log where it was written.

    The dump file contains four sections separated by blank lines: the error
    type, the current traceback, the item that failed to parse (*x*), and the
    full response representation (*obj*).
    """
    import tempfile
    from datetime import timezone

    # random suffix avoids filename collisions when several dumps land in the same second
    uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
    # timezone-aware now() — datetime.utcnow() is deprecated since Python 3.12
    time = datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S")

    # platform temp dir instead of hard-coded /tmp (portable; same path on Linux)
    dumpdir = os.path.join(tempfile.gettempdir(), "twscrape")
    dumpfile = os.path.join(dumpdir, f"twscrape_parse_error_{time}_{uniq}.txt")
    os.makedirs(dumpdir, exist_ok=True)

    with open(dumpfile, "w") as fp:
        msg = [
            f"Error parsing {kind}. Error: {type(e)}",
            traceback.format_exc(),
            # default=str so non-JSON-serializable values don't make the dump itself fail
            json.dumps(x, default=str),
            json.dumps(obj, default=str),
        ]
        fp.write("\n\n".join(msg))

    logger.error(f"Failed to parse response of {kind}, writing dump to {dumpfile}")
|
||||
|
||||
|
||||
def get_items(rep: httpx.Response, kind: str, limit: int = -1):
|
||||
def _parse_items(rep: httpx.Response, kind: str, limit: int = -1):
|
||||
if kind == "user":
|
||||
Cls = User
|
||||
key = "users"
|
||||
@ -400,15 +422,39 @@ def get_items(rep: httpx.Response, kind: str, limit: int = -1):
|
||||
if limit != -1 and len(ids) >= limit:
|
||||
break
|
||||
|
||||
tmp = Cls.parse(x, obj)
|
||||
if tmp.id not in ids:
|
||||
ids.add(tmp.id)
|
||||
yield tmp
|
||||
try:
|
||||
tmp = Cls.parse(x, obj)
|
||||
if tmp.id not in ids:
|
||||
ids.add(tmp.id)
|
||||
yield tmp
|
||||
except Exception as e:
|
||||
_write_dump(kind, e, x, obj)
|
||||
continue
|
||||
|
||||
|
||||
# public helpers


def parse_tweets(rep: httpx.Response, limit: int = -1) -> Generator[Tweet, None, None]:
    """Yield up to *limit* de-duplicated Tweet objects parsed from *rep* (-1 = no limit)."""
    return _parse_items(rep, "tweet", limit)  # type: ignore


def parse_users(rep: httpx.Response, limit: int = -1) -> Generator[User, None, None]:
    """Yield up to *limit* de-duplicated User objects parsed from *rep* (-1 = no limit)."""
    return _parse_items(rep, "user", limit)  # type: ignore
|
||||
|
||||
|
||||
def parse_user(rep: httpx.Response) -> User | None:
    """Parse a single user out of a user-by-id / user-by-login GraphQL response.

    Returns None instead of raising when the expected keys are missing
    (e.g. the API answered "resource not accessible") or parsing fails —
    the broad except is intentional, per this module's None-on-failure contract.
    """
    try:
        res = rep.json()
        return User.parse(to_old_obj(res["data"]["user"]["result"]))
    except Exception:
        return None
|
||||
|
||||
|
||||
def parse_tweet(rep: httpx.Response, twid: int) -> Tweet | None:
    """Parse tweet *twid* out of a tweet-details GraphQL response.

    Returns None when the tweet id is absent from the response or parsing
    fails for any reason — the broad except is intentional, per this
    module's None-on-failure contract.
    """
    try:
        obj = to_old_rep(rep.json())
        doc = obj["tweets"].get(str(twid), None)
        return Tweet.parse(doc, obj) if doc else None
    except Exception:
        return None
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user