add accounts pool to perform search queries

This commit is contained in:
Vlad Pronsky 2023-04-23 17:28:56 +03:00
parent fa16f206f8
commit 2ed247af0f
4 changed files with 126 additions and 52 deletions

README.md

@@ -5,14 +5,20 @@ Twitter GraphQL and Search API implementation with [SNScrape](https://github.com
 ```python
 import asyncio
 from twapi.client import UserClient
+from twapi.pool import AccountsPool
 from twapi.search import Search
 
 async def main():
-    account = UserClient("user", "pass", "user@example.com", "email_pass")
-    search = Search(account)
-    async for rep, data, _ in search.query("elon musk"):
-        print(rep.status_code, data)
+    acc1 = UserClient("user1", "pass1", "user1@example.com", "email_pass1")
+    acc2 = UserClient("user2", "pass2", "user2@example.com", "email_pass2")
+
+    pool = AccountsPool()
+    pool.add_account(acc1)
+    pool.add_account(acc2)
+
+    search = Search(pool)
+    async for rep in search.query("elon musk"):
+        print(rep.status_code, rep.json())
 
 if __name__ == "__main__":
     asyncio.run(main())
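
Each `rep` yielded by `Search.query` is an httpx response for one page of search results, and tweets sit under `globalObjects.tweets` in its JSON payload (the same path `twapi/search.py` parses below). A minimal consumption sketch, assuming the adaptive-search payload shape; `full_text` is a standard field of that API:

```python
async for rep in search.query("elon musk"):
    data = rep.json()
    # globalObjects.tweets is a mapping of tweet id -> tweet object
    for tweet_id, tweet in data.get("globalObjects", {}).get("tweets", {}).items():
        print(tweet_id, tweet.get("full_text", ""))
```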

twapi/client.py

@@ -3,11 +3,10 @@ import imaplib
 import json
 import os
 import time
-from collections import defaultdict
 from datetime import datetime, timezone
 
 from fake_useragent import UserAgent
-from httpx import Client, HTTPStatusError, Response
+from httpx import AsyncClient, Client, HTTPStatusError, Response
 from loguru import logger
 
 TOKEN = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
@@ -198,42 +197,54 @@ class UserClient:
         dirname = os.path.dirname(self.session_path)
         os.makedirs(dirname, exist_ok=True)
 
-        self.limits = defaultdict(dict)
-        self.locked = False
+        self.limits: dict[str, datetime] = {}
+        self.locked: dict[str, bool] = {}
 
     def save(self):
-        data = {
-            "cookies": dict(self.client.cookies),
-            "headers": dict(self.client.headers),
-            "limits": dict(self.limits),
-        }
+        cookies = dict(self.client.cookies)
+        headers = dict(self.client.headers)
+        # datetime is not JSON serializable, store ISO 8601 strings
+        limits = {k: v.isoformat() for k, v in self.limits.items()}
         with open(self.session_path, "w") as fp:
-            json.dump(data, fp, indent=2)
+            json.dump({"cookies": cookies, "headers": headers, "limits": limits}, fp, indent=2)
 
     def restore(self):
         try:
             with open(self.session_path) as fp:
                 data = json.load(fp)
-            self.client.cookies.update(data["cookies"])
-            self.client.headers.update(data["headers"])
-            self.limits.update(data.get("limits", {}))
+            self.client.cookies.update(data.get("cookies", {}))
+            self.client.headers.update(data.get("headers", {}))
+            self.limits.update(
+                {k: datetime.fromisoformat(v) for k, v in data.get("limits", {}).items()}
+            )
             return True
         except (FileNotFoundError, json.JSONDecodeError):
             return False
 
-    def _update_limits(self, rep: Response):
-        for k, v in rep.headers.items():
-            if k.startswith("x-rate-limit-"):
-                self.limits[rep.url.path][k] = v
-        self.save()
-
-    def check_limits(self, rep: Response, cursor: str | None):
-        if rep.status_code == 429:
-            reset = int(rep.headers.get("x-rate-limit-reset"))
-            reset_time = datetime.fromtimestamp(reset, tz=timezone.utc)
-            logger.debug(f"Rate limit exceeded for {self.username} - reset at {reset_time}")
-            self._update_limits(rep)
-            raise RateLimitExceeded(reset, cursor)
+    def make_client(self) -> AsyncClient:
+        client = AsyncClient()
+        client.headers.update(self.client.headers)
+        client.cookies.update(self.client.cookies)
+        return client
+
+    def can_use(self, queue: str):
+        if self.locked.get(queue, False):
+            return False
+
+        limit = self.limits.get(queue)
+        return not limit or limit <= datetime.now(timezone.utc)
+
+    def lock(self, queue: str):
+        self.locked[queue] = True
+
+    def unlock(self, queue: str):
+        self.locked[queue] = False
+
+    def update_limit(self, queue: str, rep: Response):
+        reset = rep.headers.get("x-rate-limit-reset", 0)
+        reset = datetime.fromtimestamp(int(reset), tz=timezone.utc)
+        self.limits[queue] = reset
+        self.save()
 
     def print_session(self):
         for x in self.client.headers.items():
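
The rate-limit bookkeeping above round-trips datetimes through JSON as ISO 8601 strings. A standalone sketch of the same flow (the header value is made up for illustration):

```python
from datetime import datetime, timezone

# update_limit(): an x-rate-limit-reset header carries a unix timestamp
headers = {"x-rate-limit-reset": "1682262536"}
reset = datetime.fromtimestamp(int(headers.get("x-rate-limit-reset", 0)), tz=timezone.utc)
limits = {"search": reset}

# save() / restore(): datetime <-> ISO 8601 string round-trip
saved = {k: v.isoformat() for k, v in limits.items()}
restored = {k: datetime.fromisoformat(v) for k, v in saved.items()}
assert restored == limits

# can_use(): a queue is usable once its reset time has passed
print(restored["search"] <= datetime.now(timezone.utc))
```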

twapi/pool.py (new file)

@@ -0,0 +1,55 @@
+import asyncio
+from typing import AsyncGenerator, Callable, Tuple
+
+from httpx import AsyncClient, HTTPStatusError, Response
+from loguru import logger
+
+from .client import UserClient
+
+
+class AccountsPool:
+    def __init__(self):
+        self.accounts: list[UserClient] = []
+
+    def add_account(self, account: UserClient):
+        self.accounts.append(account)
+
+    def get_account(self, queue: str) -> UserClient | None:
+        for x in self.accounts:
+            if x.can_use(queue):
+                return x
+        return None
+
+    async def execute(
+        self,
+        queue: str,
+        cb: Callable[
+            [AsyncClient, str | None], AsyncGenerator[Tuple[Response, dict, str | None], None]
+        ],
+        cursor: str | None = None,
+    ):
+        while True:
+            account = self.get_account(queue)
+            if not account:
+                logger.debug(f"No accounts available for queue {queue}, sleeping 5 seconds")
+                await asyncio.sleep(5)
+                continue
+            else:
+                account.lock(queue)
+                logger.debug(f"Using account {account.username} for queue {queue}")
+
+            try:
+                client = account.make_client()
+                async for x in cb(client, cursor):
+                    rep, data, cursor = x
+                    yield rep, data, cursor
+                return  # exit if no more results
+            except HTTPStatusError as e:
+                if e.response.status_code == 429:
+                    account.update_limit(queue, e.response)
+                    logger.debug(f"Account {account.username} is frozen")
+                    continue
+                else:
+                    raise e
+            finally:
+                account.unlock(queue)
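
A usage sketch for `AccountsPool.execute` outside of `Search`: the callback receives a ready `AsyncClient` plus the last cursor and must yield `(response, data, cursor)` tuples. The endpoint and pagination here are placeholders, and a real account needs valid credentials:

```python
import asyncio
from httpx import AsyncClient

from twapi.client import UserClient
from twapi.pool import AccountsPool

async def fetch_pages(client: AsyncClient, cursor: str | None):
    # placeholder endpoint; a real callback would call a Twitter API
    for page in range(int(cursor or 0), 3):
        rep = await client.get("https://httpbin.org/get", params={"page": page})
        rep.raise_for_status()
        yield rep, rep.json(), str(page + 1)

async def main():
    pool = AccountsPool()
    pool.add_account(UserClient("user", "pass", "user@example.com", "email_pass"))
    async for rep, data, cursor in pool.execute("demo", fetch_pages):
        print(rep.status_code, cursor)

asyncio.run(main())
```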

twapi/search.py

@@ -1,7 +1,7 @@
 from httpx import AsyncClient, Response
 from loguru import logger
 
-from .client import UserClient, raise_for_status
+from .pool import AccountsPool
 
 BASIC_SEARCH_PARAMS = """
 include_profile_interstitial_type=1
@@ -45,9 +45,13 @@ SEARCH_URL = "https://api.twitter.com/2/search/adaptive.json"
 SEARCH_PARAMS = dict(x.split("=") for x in BASIC_SEARCH_PARAMS.splitlines() if x)
 
 
+def rep_info(rep: Response) -> str:
+    return f"[{rep.headers['x-rate-limit-remaining']}/{rep.headers['x-rate-limit-limit']}]"
+
+
 class Search:
-    def __init__(self, account: UserClient):
-        self.account = account
+    def __init__(self, pool: AccountsPool):
+        self.pool = pool
 
     def get_next_cursor(self, res: dict) -> str | None:
         try:
@@ -63,34 +67,32 @@ class Search:
             logger.debug(e)
             return None
 
-    def _log(self, rep: Response, msg: str):
-        limit = int(rep.headers.get("x-rate-limit-limit", -1))
-        remaining = int(rep.headers.get("x-rate-limit-remaining", -1))
-        logger.debug(f"[{remaining:3,d}/{limit:3,d}] {self.account.username} - {msg}")
-
-    async def query(self, q: str, cursor: str | None = None):
-        client = AsyncClient()
-        client.headers.update(self.account.client.headers)
-        client.cookies.update(self.account.client.cookies)
-
-        total_count = 0
+    async def get(self, client: AsyncClient, q: str, cursor: str | None):
         while True:
             params = {**SEARCH_PARAMS, "q": q, "count": 20}
             params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
 
             rep = await client.get(SEARCH_URL, params=params)
-            self.account.check_limits(rep, cursor)
-            raise_for_status(rep, "search")
+            rep.raise_for_status()
 
             data = rep.json()
-            tweets = data.get("globalObjects", {}).get("tweets", [])
             cursor = self.get_next_cursor(data)
-            # logger.debug(rep.text)
-
-            total_count += len(tweets)
-            self._log(rep, f"{total_count:,d} (+{len(tweets):,d}) tweets - {q}")
+            tweets = data.get("globalObjects", {}).get("tweets", [])
 
             if not tweets or not cursor:
+                is_tweets = len(tweets) > 0
+                is_cursor = cursor is not None
+                logger.debug(f"{q} - no more results [res: {is_tweets}, cur: {is_cursor}]")
                 return
 
-            yield rep.text
+            yield rep, data, cursor
+
+    async def query(self, q: str):
+        total_count = 0
+        async for x in self.pool.execute("search", lambda c, cur: self.get(c, q, cur)):
+            rep, data, cursor = x
+            tweets = data.get("globalObjects", {}).get("tweets", [])
+            total_count += len(tweets)
+            logger.debug(f"{q} - {total_count:,d} (+{len(tweets):,d}) {rep_info(rep)}")
+            yield rep
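
A note on the design: `Search.get` yields the next cursor alongside each response so that `AccountsPool.execute` can resume pagination after a 429 instead of restarting the query from scratch. A self-contained simulation of that resume-on-rate-limit flow (no twapi or HTTP involved, names are illustrative):

```python
import asyncio

class RateLimited(Exception):
    pass

async def pages(cursor: int | None):
    # stand-in for Search.get: yields (data, next_cursor) and fails once
    start = cursor or 0
    for page in range(start, 5):
        if page == 2 and start == 0:  # simulate a 429 mid-stream on the first account
            raise RateLimited()
        yield f"page-{page}", page + 1

async def execute(cursor: int | None = None):
    # stand-in for AccountsPool.execute: restart the callback from the last cursor
    while True:
        try:
            async for data, cursor in pages(cursor):
                yield data
            return
        except RateLimited:
            continue  # "switch account" and resume from `cursor`

async def main():
    print([x async for x in execute()])  # pages 0..4, each exactly once

asyncio.run(main())
```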