зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-29 21:16:25 +02:00
add exception on rate limit & save it to session dump
Этот коммит содержится в:
родитель
5f4b0d4c4e
Коммит
fa16f206f8
@ -3,6 +3,8 @@ import imaplib
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from fake_useragent import UserAgent
|
||||
from httpx import Client, HTTPStatusError, Response
|
||||
@ -12,6 +14,12 @@ TOKEN = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Z
|
||||
TASK_URL = "https://api.twitter.com/1.1/onboarding/task.json"
|
||||
|
||||
|
||||
class RateLimitExceeded(Exception):
|
||||
def __init__(self, reset: int, cursor: str | None = None):
|
||||
self.reset = reset
|
||||
self.cursor = cursor
|
||||
|
||||
|
||||
def search_email_with_confirmation_code(imap: imaplib.IMAP4_SSL, msg_count: int) -> str | None:
|
||||
for i in range(msg_count, 0, -1):
|
||||
_, rep = imap.fetch(str(i), "(RFC822)")
|
||||
@ -190,11 +198,17 @@ class UserClient:
|
||||
dirname = os.path.dirname(self.session_path)
|
||||
os.makedirs(dirname, exist_ok=True)
|
||||
|
||||
self.limits = defaultdict(dict)
|
||||
self.locked = False
|
||||
|
||||
def save(self):
|
||||
cookies = dict(self.client.cookies)
|
||||
headers = dict(self.client.headers)
|
||||
data = {
|
||||
"cookies": dict(self.client.cookies),
|
||||
"headers": dict(self.client.headers),
|
||||
"limits": dict(self.limits),
|
||||
}
|
||||
with open(self.session_path, "w") as fp:
|
||||
json.dump({"cookies": cookies, "headers": headers}, fp)
|
||||
json.dump(data, fp, indent=2)
|
||||
|
||||
def restore(self):
|
||||
try:
|
||||
@ -202,10 +216,25 @@ class UserClient:
|
||||
data = json.load(fp)
|
||||
self.client.cookies.update(data["cookies"])
|
||||
self.client.headers.update(data["headers"])
|
||||
self.limits.update(data.get("limits", {}))
|
||||
return True
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
return False
|
||||
|
||||
def _update_limits(self, rep: Response):
|
||||
for k, v in rep.headers.items():
|
||||
if k.startswith("x-rate-limit-"):
|
||||
self.limits[rep.url.path][k] = v
|
||||
self.save()
|
||||
|
||||
def check_limits(self, rep: Response, cursor: str | None):
|
||||
if rep.status_code == 429:
|
||||
reset = int(rep.headers.get("x-rate-limit-reset"))
|
||||
reset_time = datetime.fromtimestamp(reset, tz=timezone.utc)
|
||||
logger.debug(f"Rate limit exceeded for {self.username} - reset at {reset_time}")
|
||||
self._update_limits(rep)
|
||||
raise RateLimitExceeded(reset, cursor)
|
||||
|
||||
def print_session(self):
|
||||
for x in self.client.headers.items():
|
||||
print(x)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from httpx import AsyncClient
|
||||
from httpx import AsyncClient, Response
|
||||
from loguru import logger
|
||||
|
||||
from .client import UserClient
|
||||
from .client import UserClient, raise_for_status
|
||||
|
||||
BASIC_SEARCH_PARAMS = """
|
||||
include_profile_interstitial_type=1
|
||||
@ -63,6 +63,11 @@ class Search:
|
||||
logger.debug(e)
|
||||
return None
|
||||
|
||||
def _log(self, rep: Response, msg: str):
|
||||
limit = int(rep.headers.get("x-rate-limit-limit", -1))
|
||||
remaining = int(rep.headers.get("x-rate-limit-remaining", -1))
|
||||
logger.debug(f"[{remaining:3,d}/{limit:3,d}] {self.account.username} - {msg}")
|
||||
|
||||
async def query(self, q: str, cursor: str | None = None):
|
||||
client = AsyncClient()
|
||||
client.headers.update(self.account.client.headers)
|
||||
@ -74,13 +79,18 @@ class Search:
|
||||
params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
|
||||
|
||||
rep = await client.get(SEARCH_URL, params=params)
|
||||
rep.raise_for_status()
|
||||
self.account.check_limits(rep, cursor)
|
||||
raise_for_status(rep, "search")
|
||||
|
||||
data = rep.json()
|
||||
cursor = self.get_next_cursor(data)
|
||||
tweets = data.get("globalObjects", {}).get("tweets", [])
|
||||
cursor = self.get_next_cursor(data)
|
||||
# logger.debug(rep.text)
|
||||
|
||||
total_count += len(tweets)
|
||||
self._log(rep, f"{total_count:,d} (+{len(tweets):,d}) tweets - {q}")
|
||||
|
||||
if not tweets or not cursor:
|
||||
return
|
||||
|
||||
total_count += len(tweets)
|
||||
yield rep, data, cursor
|
||||
yield rep.text
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user