зеркало из
				https://github.com/viginum-datalab/twscrape.git
				synced 2025-10-30 21:46:13 +02:00 
			
		
		
		
	add exception on rate limit & save it to session dump
Этот коммит содержится в:
		
							родитель
							
								
									5f4b0d4c4e
								
							
						
					
					
						Коммит
						fa16f206f8
					
				| @ -3,6 +3,8 @@ import imaplib | |||||||
| import json | import json | ||||||
| import os | import os | ||||||
| import time | import time | ||||||
|  | from collections import defaultdict | ||||||
|  | from datetime import datetime, timezone | ||||||
| 
 | 
 | ||||||
| from fake_useragent import UserAgent | from fake_useragent import UserAgent | ||||||
| from httpx import Client, HTTPStatusError, Response | from httpx import Client, HTTPStatusError, Response | ||||||
| @ -12,6 +14,12 @@ TOKEN = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Z | |||||||
| TASK_URL = "https://api.twitter.com/1.1/onboarding/task.json" | TASK_URL = "https://api.twitter.com/1.1/onboarding/task.json" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class RateLimitExceeded(Exception): | ||||||
|  |     def __init__(self, reset: int, cursor: str | None = None): | ||||||
|  |         self.reset = reset | ||||||
|  |         self.cursor = cursor | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def search_email_with_confirmation_code(imap: imaplib.IMAP4_SSL, msg_count: int) -> str | None: | def search_email_with_confirmation_code(imap: imaplib.IMAP4_SSL, msg_count: int) -> str | None: | ||||||
|     for i in range(msg_count, 0, -1): |     for i in range(msg_count, 0, -1): | ||||||
|         _, rep = imap.fetch(str(i), "(RFC822)") |         _, rep = imap.fetch(str(i), "(RFC822)") | ||||||
| @ -190,11 +198,17 @@ class UserClient: | |||||||
|         dirname = os.path.dirname(self.session_path) |         dirname = os.path.dirname(self.session_path) | ||||||
|         os.makedirs(dirname, exist_ok=True) |         os.makedirs(dirname, exist_ok=True) | ||||||
| 
 | 
 | ||||||
|  |         self.limits = defaultdict(dict) | ||||||
|  |         self.locked = False | ||||||
|  | 
 | ||||||
|     def save(self): |     def save(self): | ||||||
|         cookies = dict(self.client.cookies) |         data = { | ||||||
|         headers = dict(self.client.headers) |             "cookies": dict(self.client.cookies), | ||||||
|  |             "headers": dict(self.client.headers), | ||||||
|  |             "limits": dict(self.limits), | ||||||
|  |         } | ||||||
|         with open(self.session_path, "w") as fp: |         with open(self.session_path, "w") as fp: | ||||||
|             json.dump({"cookies": cookies, "headers": headers}, fp) |             json.dump(data, fp, indent=2) | ||||||
| 
 | 
 | ||||||
|     def restore(self): |     def restore(self): | ||||||
|         try: |         try: | ||||||
| @ -202,10 +216,25 @@ class UserClient: | |||||||
|                 data = json.load(fp) |                 data = json.load(fp) | ||||||
|                 self.client.cookies.update(data["cookies"]) |                 self.client.cookies.update(data["cookies"]) | ||||||
|                 self.client.headers.update(data["headers"]) |                 self.client.headers.update(data["headers"]) | ||||||
|  |                 self.limits.update(data.get("limits", {})) | ||||||
|                 return True |                 return True | ||||||
|         except (FileNotFoundError, json.JSONDecodeError): |         except (FileNotFoundError, json.JSONDecodeError): | ||||||
|             return False |             return False | ||||||
| 
 | 
 | ||||||
|  |     def _update_limits(self, rep: Response): | ||||||
|  |         for k, v in rep.headers.items(): | ||||||
|  |             if k.startswith("x-rate-limit-"): | ||||||
|  |                 self.limits[rep.url.path][k] = v | ||||||
|  |         self.save() | ||||||
|  | 
 | ||||||
|  |     def check_limits(self, rep: Response, cursor: str | None): | ||||||
|  |         if rep.status_code == 429: | ||||||
|  |             reset = int(rep.headers.get("x-rate-limit-reset")) | ||||||
|  |             reset_time = datetime.fromtimestamp(reset, tz=timezone.utc) | ||||||
|  |             logger.debug(f"Rate limit exceeded for {self.username} - reset at {reset_time}") | ||||||
|  |             self._update_limits(rep) | ||||||
|  |             raise RateLimitExceeded(reset, cursor) | ||||||
|  | 
 | ||||||
|     def print_session(self): |     def print_session(self): | ||||||
|         for x in self.client.headers.items(): |         for x in self.client.headers.items(): | ||||||
|             print(x) |             print(x) | ||||||
|  | |||||||
| @ -1,7 +1,7 @@ | |||||||
| from httpx import AsyncClient | from httpx import AsyncClient, Response | ||||||
| from loguru import logger | from loguru import logger | ||||||
| 
 | 
 | ||||||
| from .client import UserClient | from .client import UserClient, raise_for_status | ||||||
| 
 | 
 | ||||||
| BASIC_SEARCH_PARAMS = """ | BASIC_SEARCH_PARAMS = """ | ||||||
| include_profile_interstitial_type=1 | include_profile_interstitial_type=1 | ||||||
| @ -63,6 +63,11 @@ class Search: | |||||||
|             logger.debug(e) |             logger.debug(e) | ||||||
|             return None |             return None | ||||||
| 
 | 
 | ||||||
|  |     def _log(self, rep: Response, msg: str): | ||||||
|  |         limit = int(rep.headers.get("x-rate-limit-limit", -1)) | ||||||
|  |         remaining = int(rep.headers.get("x-rate-limit-remaining", -1)) | ||||||
|  |         logger.debug(f"[{remaining:3,d}/{limit:3,d}] {self.account.username} - {msg}") | ||||||
|  | 
 | ||||||
|     async def query(self, q: str, cursor: str | None = None): |     async def query(self, q: str, cursor: str | None = None): | ||||||
|         client = AsyncClient() |         client = AsyncClient() | ||||||
|         client.headers.update(self.account.client.headers) |         client.headers.update(self.account.client.headers) | ||||||
| @ -74,13 +79,18 @@ class Search: | |||||||
|             params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch" |             params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch" | ||||||
| 
 | 
 | ||||||
|             rep = await client.get(SEARCH_URL, params=params) |             rep = await client.get(SEARCH_URL, params=params) | ||||||
|             rep.raise_for_status() |             self.account.check_limits(rep, cursor) | ||||||
|  |             raise_for_status(rep, "search") | ||||||
| 
 | 
 | ||||||
|             data = rep.json() |             data = rep.json() | ||||||
|             cursor = self.get_next_cursor(data) |  | ||||||
|             tweets = data.get("globalObjects", {}).get("tweets", []) |             tweets = data.get("globalObjects", {}).get("tweets", []) | ||||||
|  |             cursor = self.get_next_cursor(data) | ||||||
|  |             # logger.debug(rep.text) | ||||||
|  | 
 | ||||||
|  |             total_count += len(tweets) | ||||||
|  |             self._log(rep, f"{total_count:,d} (+{len(tweets):,d}) tweets - {q}") | ||||||
|  | 
 | ||||||
|             if not tweets or not cursor: |             if not tweets or not cursor: | ||||||
|                 return |                 return | ||||||
| 
 | 
 | ||||||
|             total_count += len(tweets) |             yield rep.text | ||||||
|             yield rep, data, cursor |  | ||||||
|  | |||||||
		Загрузка…
	
	
			
			x
			
			
		
	
		Ссылка в новой задаче
	
	Block a user
	 Vlad Pronsky
						Vlad Pronsky