зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-29 21:16:25 +02:00
add accounts pool to perform search queries
Этот коммит содержится в:
родитель
fa16f206f8
Коммит
2ed247af0f
14
readme.md
14
readme.md
@ -5,14 +5,20 @@ Twitter GraphQL and Search API implementation with [SNScrape](https://github.com
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from twapi.client import UserClient
|
from twapi.client import UserClient
|
||||||
|
from twapi.pool import AccountsPool
|
||||||
from twapi.search import Search
|
from twapi.search import Search
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
account = UserClient("user", "pass", "user@example.com", "email_pass")
|
acc1 = UserClient("user1", "pass1", "user1@example.com", "email_pass1")
|
||||||
search = Search(account)
|
acc2 = UserClient("user2", "pass2", "user2@example.com", "email_pass2")
|
||||||
|
|
||||||
async for rep, data, _ in search.query("elon musk"):
|
pool = AccountsPool()
|
||||||
print(rep.status_code, data)
|
pool.add_account(acc1)
|
||||||
|
pool.add_account(acc2)
|
||||||
|
|
||||||
|
search = Search(pool)
|
||||||
|
async for rep in search.query("elon musk"):
|
||||||
|
print(rep.status_code, rep.json())
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@ -3,11 +3,10 @@ import imaplib
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
from httpx import Client, HTTPStatusError, Response
|
from httpx import AsyncClient, Client, HTTPStatusError, Response
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
TOKEN = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
|
TOKEN = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
|
||||||
@ -198,42 +197,54 @@ class UserClient:
|
|||||||
dirname = os.path.dirname(self.session_path)
|
dirname = os.path.dirname(self.session_path)
|
||||||
os.makedirs(dirname, exist_ok=True)
|
os.makedirs(dirname, exist_ok=True)
|
||||||
|
|
||||||
self.limits = defaultdict(dict)
|
self.limits: dict[str, datetime] = {}
|
||||||
self.locked = False
|
self.locked: dict[str, bool] = {}
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
data = {
|
cookies = dict(self.client.cookies)
|
||||||
"cookies": dict(self.client.cookies),
|
headers = dict(self.client.headers)
|
||||||
"headers": dict(self.client.headers),
|
limits = dict(self.limits)
|
||||||
"limits": dict(self.limits),
|
|
||||||
}
|
|
||||||
with open(self.session_path, "w") as fp:
|
with open(self.session_path, "w") as fp:
|
||||||
json.dump(data, fp, indent=2)
|
json.dump({"cookies": cookies, "headers": headers, "limits": limits}, fp, indent=2)
|
||||||
|
|
||||||
def restore(self):
|
def restore(self):
|
||||||
try:
|
try:
|
||||||
with open(self.session_path) as fp:
|
with open(self.session_path) as fp:
|
||||||
data = json.load(fp)
|
data = json.load(fp)
|
||||||
self.client.cookies.update(data["cookies"])
|
self.client.cookies.update(data.get("cookies", {}))
|
||||||
self.client.headers.update(data["headers"])
|
self.client.headers.update(data.get("headers", {}))
|
||||||
self.limits.update(data.get("limits", {}))
|
self.limits.update(
|
||||||
|
{k: datetime.fromisoformat(v) for k, v in data.get("limits", {}).items()}
|
||||||
|
)
|
||||||
return True
|
return True
|
||||||
except (FileNotFoundError, json.JSONDecodeError):
|
except (FileNotFoundError, json.JSONDecodeError):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _update_limits(self, rep: Response):
|
def make_client(self) -> AsyncClient:
|
||||||
for k, v in rep.headers.items():
|
client = AsyncClient()
|
||||||
if k.startswith("x-rate-limit-"):
|
client.headers.update(self.client.headers)
|
||||||
self.limits[rep.url.path][k] = v
|
client.cookies.update(self.client.cookies)
|
||||||
self.save()
|
return client
|
||||||
|
|
||||||
def check_limits(self, rep: Response, cursor: str | None):
|
def can_use(self, queue: str):
|
||||||
if rep.status_code == 429:
|
if self.locked.get(queue, False):
|
||||||
reset = int(rep.headers.get("x-rate-limit-reset"))
|
return False
|
||||||
reset_time = datetime.fromtimestamp(reset, tz=timezone.utc)
|
|
||||||
logger.debug(f"Rate limit exceeded for {self.username} - reset at {reset_time}")
|
limit = self.limits.get(queue)
|
||||||
self._update_limits(rep)
|
return not limit or limit <= datetime.now(timezone.utc)
|
||||||
raise RateLimitExceeded(reset, cursor)
|
|
||||||
|
def lock(self, queue: str):
|
||||||
|
self.locked[queue] = True
|
||||||
|
|
||||||
|
def unlock(self, queue: str):
|
||||||
|
self.locked[queue] = False
|
||||||
|
|
||||||
|
def update_limit(self, queue: str, rep: Response):
|
||||||
|
reset = rep.headers.get("x-rate-limit-reset", 0)
|
||||||
|
reset = datetime.fromtimestamp(int(reset), tz=timezone.utc)
|
||||||
|
self.limits[queue] = reset
|
||||||
|
self.save()
|
||||||
|
|
||||||
def print_session(self):
|
def print_session(self):
|
||||||
for x in self.client.headers.items():
|
for x in self.client.headers.items():
|
||||||
|
|||||||
55
twapi/pool.py
Обычный файл
55
twapi/pool.py
Обычный файл
@ -0,0 +1,55 @@
|
|||||||
|
import asyncio
|
||||||
|
from typing import AsyncGenerator, Callable, Tuple
|
||||||
|
|
||||||
|
from httpx import AsyncClient, HTTPStatusError, Response
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from .client import UserClient
|
||||||
|
|
||||||
|
|
||||||
|
class AccountsPool:
|
||||||
|
def __init__(self):
|
||||||
|
self.accounts: list[UserClient] = []
|
||||||
|
|
||||||
|
def add_account(self, account: UserClient):
|
||||||
|
self.accounts.append(account)
|
||||||
|
|
||||||
|
def get_account(self, queue: str) -> UserClient | None:
|
||||||
|
for x in self.accounts:
|
||||||
|
if x.can_use(queue):
|
||||||
|
return x
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def execute(
|
||||||
|
self,
|
||||||
|
queue: str,
|
||||||
|
cb: Callable[
|
||||||
|
[AsyncClient, str | None], AsyncGenerator[Tuple[Response, dict, str | None], None]
|
||||||
|
],
|
||||||
|
cursor: str | None = None,
|
||||||
|
):
|
||||||
|
while True:
|
||||||
|
account = self.get_account(queue)
|
||||||
|
if not account:
|
||||||
|
logger.debug(f"No accounts available for queue {queue}, sleeping 5 seconds")
|
||||||
|
await asyncio.sleep(5)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
account.lock(queue)
|
||||||
|
logger.debug(f"Using account {account.username} for queue {queue}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
client = account.make_client()
|
||||||
|
async for x in cb(client, cursor):
|
||||||
|
rep, data, cursor = x
|
||||||
|
yield rep, data, cursor
|
||||||
|
return # exit if no more results
|
||||||
|
except HTTPStatusError as e:
|
||||||
|
if e.response.status_code == 429:
|
||||||
|
account.update_limit(queue, e.response)
|
||||||
|
logger.debug(f"Account {account.username} is frozen")
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
finally:
|
||||||
|
account.unlock(queue)
|
||||||
@ -1,7 +1,7 @@
|
|||||||
from httpx import AsyncClient, Response
|
from httpx import AsyncClient, Response
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from .client import UserClient, raise_for_status
|
from .pool import AccountsPool
|
||||||
|
|
||||||
BASIC_SEARCH_PARAMS = """
|
BASIC_SEARCH_PARAMS = """
|
||||||
include_profile_interstitial_type=1
|
include_profile_interstitial_type=1
|
||||||
@ -45,9 +45,13 @@ SEARCH_URL = "https://api.twitter.com/2/search/adaptive.json"
|
|||||||
SEARCH_PARAMS = dict(x.split("=") for x in BASIC_SEARCH_PARAMS.splitlines() if x)
|
SEARCH_PARAMS = dict(x.split("=") for x in BASIC_SEARCH_PARAMS.splitlines() if x)
|
||||||
|
|
||||||
|
|
||||||
|
def rep_info(rep: Response) -> str:
|
||||||
|
return f"[{rep.headers['x-rate-limit-remaining']}/{rep.headers['x-rate-limit-limit']}]"
|
||||||
|
|
||||||
|
|
||||||
class Search:
|
class Search:
|
||||||
def __init__(self, account: UserClient):
|
def __init__(self, pool: AccountsPool):
|
||||||
self.account = account
|
self.pool = pool
|
||||||
|
|
||||||
def get_next_cursor(self, res: dict) -> str | None:
|
def get_next_cursor(self, res: dict) -> str | None:
|
||||||
try:
|
try:
|
||||||
@ -63,34 +67,32 @@ class Search:
|
|||||||
logger.debug(e)
|
logger.debug(e)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _log(self, rep: Response, msg: str):
|
async def get(self, client: AsyncClient, q: str, cursor: str | None):
|
||||||
limit = int(rep.headers.get("x-rate-limit-limit", -1))
|
|
||||||
remaining = int(rep.headers.get("x-rate-limit-remaining", -1))
|
|
||||||
logger.debug(f"[{remaining:3,d}/{limit:3,d}] {self.account.username} - {msg}")
|
|
||||||
|
|
||||||
async def query(self, q: str, cursor: str | None = None):
|
|
||||||
client = AsyncClient()
|
|
||||||
client.headers.update(self.account.client.headers)
|
|
||||||
client.cookies.update(self.account.client.cookies)
|
|
||||||
|
|
||||||
total_count = 0
|
|
||||||
while True:
|
while True:
|
||||||
params = {**SEARCH_PARAMS, "q": q, "count": 20}
|
params = {**SEARCH_PARAMS, "q": q, "count": 20}
|
||||||
params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
|
params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
|
||||||
|
|
||||||
rep = await client.get(SEARCH_URL, params=params)
|
rep = await client.get(SEARCH_URL, params=params)
|
||||||
self.account.check_limits(rep, cursor)
|
rep.raise_for_status()
|
||||||
raise_for_status(rep, "search")
|
|
||||||
|
|
||||||
data = rep.json()
|
data = rep.json()
|
||||||
tweets = data.get("globalObjects", {}).get("tweets", [])
|
|
||||||
cursor = self.get_next_cursor(data)
|
cursor = self.get_next_cursor(data)
|
||||||
# logger.debug(rep.text)
|
tweets = data.get("globalObjects", {}).get("tweets", [])
|
||||||
|
|
||||||
total_count += len(tweets)
|
|
||||||
self._log(rep, f"{total_count:,d} (+{len(tweets):,d}) tweets - {q}")
|
|
||||||
|
|
||||||
if not tweets or not cursor:
|
if not tweets or not cursor:
|
||||||
|
is_tweets = len(tweets) > 0
|
||||||
|
is_cursor = cursor is not None
|
||||||
|
logger.debug(f"{q} - no more results [res: {is_tweets}, cur: {is_cursor}]")
|
||||||
return
|
return
|
||||||
|
|
||||||
yield rep.text
|
yield rep, data, cursor
|
||||||
|
|
||||||
|
async def query(self, q: str):
|
||||||
|
total_count = 0
|
||||||
|
async for x in self.pool.execute("search", lambda c, cur: self.get(c, q, cur)):
|
||||||
|
rep, data, cursor = x
|
||||||
|
|
||||||
|
tweets = data.get("globalObjects", {}).get("tweets", [])
|
||||||
|
total_count += len(tweets)
|
||||||
|
logger.debug(f"{q} - {total_count:,d} (+{len(tweets):,d}) {rep_info(rep)}")
|
||||||
|
|
||||||
|
yield rep
|
||||||
|
|||||||
Загрузка…
x
Ссылка в новой задаче
Block a user