Mirror of https://github.com/viginum-datalab/twscrape.git
Synced 2025-10-30 13:36:12 +02:00

add graphql api support

Parent: 2ed247af0f
Commit: 9744b80a67
@@ -20,6 +20,17 @@ class AccountsPool:
                 return x
         return None
 
+    async def get_account_or_wait(self, queue: str) -> UserClient:
+        while True:
+            account = self.get_account(queue)
+            if account:
+                logger.debug(f"Using account {account.username} for queue '{queue}'")
+                account.lock(queue)
+                return account
+            else:
+                logger.debug(f"No accounts available for queue '{queue}' (sleeping for 5 sec)")
+                await asyncio.sleep(5)
+
     async def execute(
         self,
         queue: str,
@@ -29,14 +40,7 @@ class AccountsPool:
         cursor: str | None = None,
     ):
         while True:
-            account = self.get_account(queue)
-            if not account:
-                logger.debug(f"No accounts available for queue {queue}, sleeping 5 seconds")
-                await asyncio.sleep(5)
-                continue
-            else:
-                account.lock(queue)
-                logger.debug(f"Using account {account.username} for queue {queue}")
+            account = await self.get_account_or_wait(queue)
 
             try:
                 client = account.make_client()
@@ -47,7 +51,7 @@ class AccountsPool:
             except HTTPStatusError as e:
                 if e.response.status_code == 429:
                     account.update_limit(queue, e.response)
-                    logger.debug(f"Account {account.username} is frozen")
+                    logger.debug(f"Rate limit reached for account {account.username}")
                     continue
                 else:
                     raise e
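
For context, a standalone sketch of the control flow these hunks introduce: callers block in get_account_or_wait until an account is free for the queue, and a 429 response retries with whatever account becomes available next. DummyAccount/DummyPool, the 1-second poll and the unlock-in-finally below are illustrative stand-ins, not the project's real classes; only the loop shape mirrors the diff.

import asyncio
from httpx import AsyncClient, HTTPStatusError


class DummyAccount:
    # Stand-in for the pool's account objects (lock/unlock per queue).
    def __init__(self, username: str):
        self.username = username
        self.locked: set[str] = set()

    def can_use(self, queue: str) -> bool:
        return queue not in self.locked

    def lock(self, queue: str) -> None:
        self.locked.add(queue)

    def unlock(self, queue: str) -> None:
        self.locked.discard(queue)


class DummyPool:
    def __init__(self, accounts: list[DummyAccount]):
        self.accounts = accounts

    async def get_account_or_wait(self, queue: str) -> DummyAccount:
        # Same shape as the new helper above: poll until an account is free,
        # lock it for this queue, then hand it to the caller.
        while True:
            free = [x for x in self.accounts if x.can_use(queue)]
            if free:
                free[0].lock(queue)
                return free[0]
            await asyncio.sleep(1)  # the diff sleeps 5 seconds


async def fetch(pool: DummyPool, queue: str, url: str):
    # Roughly the retry shape of execute(): a 429 means try again with the
    # next available account; any other HTTP error propagates to the caller.
    while True:
        account = await pool.get_account_or_wait(queue)
        try:
            async with AsyncClient() as client:
                rep = await client.get(url)
                rep.raise_for_status()
                return rep
        except HTTPStatusError as e:
            if e.response.status_code == 429:
                continue  # the real code also records the limit via update_limit()
            raise
        finally:
            account.unlock(queue)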

twapi/search.py (110 changed lines)

@@ -1,3 +1,5 @@
+import json
+
 from httpx import AsyncClient, Response
 from loguru import logger
 
@@ -41,12 +43,54 @@ include_ext_edit_control=true
 ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2CbirdwatchPivot%2Cenrichments%2CsuperFollowMetadata%2CunmentionInfo%2CeditControl%2Cvibe
 """
 
+BASE_FEATURES = {
+    "blue_business_profile_image_shape_enabled": True,
+    "responsive_web_graphql_exclude_directive_enabled": True,
+    "verified_phone_label_enabled": False,
+    "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
+    "responsive_web_graphql_timeline_navigation_enabled": True,
+    #
+    "tweetypie_unmention_optimization_enabled": True,
+    "vibe_api_enabled": True,
+    "responsive_web_edit_tweet_api_enabled": True,
+    "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
+    "view_counts_everywhere_api_enabled": True,
+    "longform_notetweets_consumption_enabled": True,
+    "tweet_awards_web_tipping_enabled": False,
+    "freedom_of_speech_not_reach_fetch_enabled": True,
+    "standardized_nudges_misinfo": True,
+    "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
+    "interactive_text_enabled": True,
+    "responsive_web_text_conversations_enabled": False,
+    "longform_notetweets_rich_text_read_enabled": True,
+    "responsive_web_enhance_cards_enabled": False,
+}
+
 SEARCH_URL = "https://api.twitter.com/2/search/adaptive.json"
 SEARCH_PARAMS = dict(x.split("=") for x in BASIC_SEARCH_PARAMS.splitlines() if x)
 
 
+def json_params(params: dict):
+    return {k: json.dumps(v, separators=(",", ":")) for k, v in params.items()}
+
+
+def get_ql_entries(obj: dict) -> list[dict]:
+    try:
+        key = list(obj["data"].keys())[0]
+        return obj["data"][key]["timeline"]["instructions"][0]["entries"]
+    except Exception:
+        return []
+
+
+def get_ql_cursor(obj: dict) -> str | None:
+    for entry in get_ql_entries(obj):
+        if entry["entryId"].startswith("cursor-bottom-"):
+            return entry["content"]["value"]
+    return None
+
+
 def rep_info(rep: Response) -> str:
-    return f"[{rep.headers['x-rate-limit-remaining']}/{rep.headers['x-rate-limit-limit']}]"
+    return f"[{rep.status_code} ~ {rep.headers['x-rate-limit-remaining']}/{rep.headers['x-rate-limit-limit']}]"
 
 
 class Search:
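
To make the request/response shape concrete, a small self-contained walkthrough of the helpers above. The helper bodies are copied from the diff so the snippet runs on its own; the response dict is a made-up minimal example of the data -> <operation> -> timeline -> instructions[0] -> entries nesting, not a real API payload.

import json


def json_params(params: dict):
    # Values are JSON-encoded, which is how the GraphQL endpoints expect the
    # "variables" and "features" query-string parameters to arrive.
    return {k: json.dumps(v, separators=(",", ":")) for k, v in params.items()}


def get_ql_entries(obj: dict) -> list[dict]:
    try:
        key = list(obj["data"].keys())[0]
        return obj["data"][key]["timeline"]["instructions"][0]["entries"]
    except Exception:
        return []


def get_ql_cursor(obj: dict) -> str | None:
    for entry in get_ql_entries(obj):
        if entry["entryId"].startswith("cursor-bottom-"):
            return entry["content"]["value"]
    return None


print(json_params({"variables": {"screen_name": "example", "cursor": None}}))
# -> {'variables': '{"screen_name":"example","cursor":null}'}

fake_rep = {
    "data": {
        "example_timeline": {  # operation key, made up for the example
            "timeline": {
                "instructions": [{
                    "entries": [
                        {"entryId": "user-123", "content": {}},
                        {"entryId": "cursor-top-1", "content": {"value": "TOP"}},
                        {"entryId": "cursor-bottom-2", "content": {"value": "BOTTOM"}},
                    ]
                }]
            }
        }
    }
}

print(len(get_ql_entries(fake_rep)))  # 3 entries, including the two cursors
print(get_ql_cursor(fake_rep))        # BOTTOM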
@@ -67,7 +111,7 @@ class Search:
             logger.debug(e)
             return None
 
-    async def get(self, client: AsyncClient, q: str, cursor: str | None):
+    async def get_items(self, client: AsyncClient, q: str, cursor: str | None):
         while True:
             params = {**SEARCH_PARAMS, "q": q, "count": 20}
             params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
@@ -79,16 +123,16 @@ class Search:
             cursor = self.get_next_cursor(data)
             tweets = data.get("globalObjects", {}).get("tweets", [])
             if not tweets or not cursor:
-                is_tweets = len(tweets) > 0
+                is_result = len(tweets) > 0
                 is_cursor = cursor is not None
-                logger.debug(f"{q} - no more results [res: {is_tweets}, cur: {is_cursor}]")
+                logger.debug(f"{q} - no more items [res={is_result} cur={is_cursor}]")
                 return
 
             yield rep, data, cursor
 
-    async def query(self, q: str):
+    async def search(self, q: str):
         total_count = 0
-        async for x in self.pool.execute("search", lambda c, cur: self.get(c, q, cur)):
+        async for x in self.pool.execute("search", lambda c, cur: self.get_items(c, q, cur)):
             rep, data, cursor = x
 
             tweets = data.get("globalObjects", {}).get("tweets", [])
@@ -96,3 +140,57 @@ class Search:
             logger.debug(f"{q} - {total_count:,d} (+{len(tweets):,d}) {rep_info(rep)}")
 
             yield rep
+
+    async def graphql_items(self, op: str, variables: dict, features: dict = {}, limit=-1):
+        url = f"https://twitter.com/i/api/graphql/{op}"
+        features = {**BASE_FEATURES, **features}
+
+        cursor, all_count, queue = None, 0, op.split("/")[-1]
+        while True:
+            account = await self.pool.get_account_or_wait(queue)
+            client = account.make_client()
+
+            try:
+                params = {"variables": {**variables, "cursor": cursor}, "features": features}
+                rep = await client.get(url, params=json_params(params))
+                logger.debug(f"{url} {rep_info(rep)}")
+                rep.raise_for_status()
+
+                data = rep.json()
+                entries, cursor = get_ql_entries(data), get_ql_cursor(data)
+
+                # cursor-top / cursor-bottom always present
+                now_count = len([x for x in entries if not x["entryId"].startswith("cursor-")])
+                all_count += now_count
+
+                yield rep
+
+                if not cursor or not now_count or (limit > 0 and all_count >= limit):
+                    return
+            finally:
+                account.unlock(queue)
+
+    async def graphql_item(self, op: str, variables: dict, features: dict = {}):
+        res: list[Response] = []
+        async for x in self.graphql_items(op, variables, features):
+            res.append(x)
+            break
+        return res[0]
+
+    async def user_by_login(self, login: str):
+        v = {"screen_name": login, "withSafetyModeUserFields": True}
+        return await self.graphql_item("sLVLhk0bGj3MVFEKTdax1w/UserByScreenName", v)
+
+    async def user_by_id(self, uid: int):
+        v = {"userId": str(uid), "withSafetyModeUserFields": True}
+        return await self.graphql_item("GazOglcBvgLigl3ywt6b3Q/UserByRestId", v)
+
+    async def retweeters(self, twid: int, limit=-1):
+        v = {"tweetId": str(twid), "count": 20, "includePromotedContent": True}
+        async for x in self.graphql_items("U5f_jm0CiLmSfI1d4rGleQ/Retweeters", v, limit=limit):
+            yield x
+
+    async def favoriters(self, twid: int, limit=-1):
+        v = {"tweetId": str(twid), "count": 20, "includePromotedContent": True}
+        async for x in self.graphql_items("vcTrPlh9ovFDQejz22q9vg/Favoriters", v, limit=limit):
+            yield x
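
Finally, a hedged usage sketch of the new GraphQL surface. The diff does not show how Search is constructed or how accounts reach the pool, so the wiring below is assumed; the method names, arguments and yield behaviour are the ones added above.

# Assumed wiring (not shown in this diff): pool = AccountsPool(...); search = Search(pool)

async def demo(search):
    # Single-object calls return one httpx.Response from graphql_item().
    rep = await search.user_by_login("jack")
    print(rep.status_code, list(rep.json().get("data", {}).keys()))

    # Paginated calls are async generators: graphql_items() keeps requesting
    # with the bottom cursor until a page brings no new entries, the cursor
    # disappears, or `limit` is reached, yielding one Response per page.
    # The tweet id below is a placeholder.
    pages = 0
    async for rep in search.retweeters(1234567890123456789, limit=40):
        pages += 1
    print("pages fetched:", pages)

# With real objects in place: asyncio.run(demo(search))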