зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-29 05:04:22 +02:00
fix typing; update gql endpoints
Этот коммит содержится в:
родитель
7b62efb38a
Коммит
61d159c86d
2
Makefile
2
Makefile
@ -29,7 +29,7 @@ pylint:
|
||||
test:
|
||||
@pytest -s --cov=twscrape tests/
|
||||
|
||||
show-cov:
|
||||
test-cov:
|
||||
@pytest -s --cov=twscrape tests/
|
||||
@coverage html
|
||||
@open htmlcov/index.html
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import httpx
|
||||
from fake_useragent import UserAgent
|
||||
|
||||
# note: update this url on next run
|
||||
# url = "https://abs.twimg.com/responsive-web/client-web/api.f4ff3bfa.js"
|
||||
# url = "https://abs.twimg.com/responsive-web/client-web/api.bb81931a.js"
|
||||
url = "https://abs.twimg.com/responsive-web/client-web/main.45d48c6a.js"
|
||||
client = httpx.Client(headers={"user-agent": UserAgent().chrome})
|
||||
|
||||
ops = """
|
||||
SearchTimeline
|
||||
@ -23,13 +23,72 @@ ListLatestTweetsTimeline
|
||||
|
||||
ops = [op.strip() for op in ops.split("\n") if op.strip()]
|
||||
|
||||
script: str = httpx.get(url).text
|
||||
pairs = re.findall(r'queryId:"(.+?)".+?operationName:"(.+?)"', script)
|
||||
|
||||
def script_url(k: str, v: str) -> str:
    """Return the abs.twimg.com URL of the client-web JS bundle ``<k>.<v>.js``.

    Args:
        k: Bundle (chunk) name, e.g. ``"main"``.
        v: Hash part of the bundle file name.
    """
    return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
|
||||
|
||||
|
||||
def get_scripts():
    """Return the source text of the client-web JS bundles referenced by twitter.com.

    Fetches a profile page with the module-level ``client``, extracts the chunk
    map embedded in the page and the hash of the ``main`` bundle, builds the
    bundle URLs with ``script_url`` and downloads each one. Downloaded bundles
    are cached as files under ``/tmp/twscrape-ops`` (keyed by file name), so a
    later run reads them from disk instead of the network.

    Returns:
        A list with the text of every bundle, in the order of the URL list.

    Raises:
        SystemExit: with code 1 when the embedded chunk map is not valid JSON
            (the raw text and the decode error are printed first).
        IndexError: when the expected markers are not present in the page.
        Whatever ``raise_for_status()`` raises for a non-success HTTP response.
    """
    cache_dir = "/tmp/twscrape-ops"
    os.makedirs(cache_dir, exist_ok=True)

    rep = client.get("https://twitter.com/elonmusk")
    rep.raise_for_status()
    urls = []

    # The page contains `e=>e+"."+{...}[e]+"a.js"`; the text between the two
    # markers is parsed as a JSON object of chunk name -> hash prefix.
    chunk_map = rep.text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
    try:
        for k, v in json.loads(chunk_map).items():
            # The page appends "a" to the hash before ".js", so do the same here.
            urls.append(script_url(k, f"{v}a"))
    except json.decoder.JSONDecodeError as e:
        print(chunk_map)
        print(e)
        # SystemExit instead of the site-provided exit(), which is meant for
        # the interactive interpreter and may be absent (e.g. `python -S`).
        raise SystemExit(1)

    # The main bundle is referenced separately as ".../client-web/main.<hash>.js".
    v = rep.text.split("/client-web/main.")[1].split(".")[0]
    urls.append(script_url("main", v))

    # Skip bundles whose URL marks them as i18n, icons or the syntax highlighter.
    urls = [
        x
        for x in urls
        if "/i18n/" not in x and "/icons/" not in x and "react-syntax-highlighter" not in x
    ]

    scripts = []
    for i, x in enumerate(urls, 1):
        # Cache key is the file name part of the URL without any query string.
        cache_path = os.path.join(cache_dir, x.split("/")[-1].split("?")[0])
        if os.path.exists(cache_path):
            # Explicit encoding: the locale default may not be able to decode
            # the bundle text on every platform.
            with open(cache_path, encoding="utf-8") as fp:
                scripts.append(fp.read())
            continue

        print(f"({i:3d} / {len(urls):3d}) {x}")
        rep = client.get(x)
        rep.raise_for_status()

        with open(cache_path, "w", encoding="utf-8") as fp:
            fp.write(rep.text)
        scripts.append(rep.text)

    return scripts
|
||||
|
||||
|
||||
all_pairs = {}
|
||||
for txt in get_scripts():
|
||||
pairs = re.findall(r'queryId:"(.+?)".+?operationName:"(.+?)"', txt)
|
||||
pairs = {op_name: op_id for op_id, op_name in pairs}
|
||||
|
||||
for x in ops:
|
||||
print(f'OP_{x} = "{pairs.get(x, "???")}/{x}"')
|
||||
for k, v in pairs.items():
|
||||
if k in all_pairs and v != all_pairs[k]:
|
||||
print(f"DIFF: {k} = {v} != {all_pairs[k]}")
|
||||
|
||||
# for ??? check urls:
|
||||
# https://twitter.com/SpaceX/status/1719132541632864696/likes
|
||||
# https://twitter.com/i/lists/1494877848087187461
|
||||
all_pairs[k] = v
|
||||
|
||||
|
||||
for k, v in all_pairs.items():
|
||||
print(f'OP_{k} = "{v}/{k}"')
|
||||
|
||||
print("-" * 40)
|
||||
|
||||
for x in ops:
|
||||
print(f'OP_{x} = "{all_pairs.get(x, "???")}/{x}"')
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -48,16 +48,16 @@
|
||||
}
|
||||
},
|
||||
"fast_followers_count": 0,
|
||||
"favourites_count": 2077,
|
||||
"followers_count": 596281,
|
||||
"friends_count": 1913,
|
||||
"favourites_count": 2075,
|
||||
"followers_count": 600385,
|
||||
"friends_count": 1800,
|
||||
"has_custom_timelines": true,
|
||||
"is_translator": false,
|
||||
"listed_count": 2516,
|
||||
"listed_count": 2551,
|
||||
"location": "127.0.0.1",
|
||||
"media_count": 815,
|
||||
"name": "Developers",
|
||||
"normal_followers_count": 596281,
|
||||
"normal_followers_count": 600385,
|
||||
"pinned_tweet_ids_str": [
|
||||
"1661790253886177280"
|
||||
],
|
||||
|
||||
@ -48,16 +48,16 @@
|
||||
}
|
||||
},
|
||||
"fast_followers_count": 0,
|
||||
"favourites_count": 2077,
|
||||
"followers_count": 596281,
|
||||
"friends_count": 1913,
|
||||
"favourites_count": 2075,
|
||||
"followers_count": 600385,
|
||||
"friends_count": 1800,
|
||||
"has_custom_timelines": true,
|
||||
"is_translator": false,
|
||||
"listed_count": 2516,
|
||||
"listed_count": 2551,
|
||||
"location": "127.0.0.1",
|
||||
"media_count": 815,
|
||||
"name": "Developers",
|
||||
"normal_followers_count": 596281,
|
||||
"normal_followers_count": 600385,
|
||||
"pinned_tweet_ids_str": [
|
||||
"1661790253886177280"
|
||||
],
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
@ -4,16 +4,12 @@ from httpx import Response
|
||||
from .accounts_pool import AccountsPool
|
||||
from .constants import * # noqa: F403
|
||||
from .logger import set_log_level
|
||||
from .models import parse_tweet, parse_tweets, parse_user, parse_users
|
||||
from .models import Tweet, User, parse_tweet, parse_tweets, parse_user, parse_users
|
||||
from .queue_client import QueueClient
|
||||
from .utils import encode_params, find_obj, get_by_path
|
||||
|
||||
# Note: kv is variables, ft is features from original GQL request
|
||||
|
||||
SEARCH_FEATURES = {
|
||||
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
|
||||
}
|
||||
|
||||
|
||||
class API:
|
||||
pool: AccountsPool
|
||||
@ -50,25 +46,27 @@ class API:
|
||||
# gql helpers
|
||||
|
||||
async def _gql_items(self, op: str, kv: dict, ft: dict | None = None, limit=-1):
|
||||
queue, cursor, count, active = op.split("/")[-1], None, 0, True
|
||||
queue, cur, cnt, active = op.split("/")[-1], None, 0, True
|
||||
kv, ft = {**kv}, {**GQL_FEATURES, **(ft or {})}
|
||||
|
||||
async with QueueClient(self.pool, queue, self.debug) as client:
|
||||
while active:
|
||||
params = {"variables": kv, "features": ft}
|
||||
if cursor is not None:
|
||||
params["variables"]["cursor"] = cursor
|
||||
if cur is not None:
|
||||
params["variables"]["cursor"] = cur
|
||||
if queue in ("SearchTimeline", "ListLatestTweetsTimeline"):
|
||||
params["fieldToggles"] = {"withArticleRichContentState": False}
|
||||
|
||||
rep = await client.get(f"{GQL_URL}/{op}", params=encode_params(params))
|
||||
if rep is None:
|
||||
return
|
||||
|
||||
obj = rep.json()
|
||||
els = get_by_path(obj, "entries") or []
|
||||
els = [x for x in els if not x["entryId"].startswith("cursor-")]
|
||||
cur = self._get_cursor(obj)
|
||||
|
||||
entries = get_by_path(obj, "entries") or []
|
||||
entries = [x for x in entries if not x["entryId"].startswith("cursor-")]
|
||||
cursor = self._get_cursor(obj)
|
||||
|
||||
rep, count, active = self._is_end(rep, queue, entries, cursor, count, limit)
|
||||
rep, cnt, active = self._is_end(rep, queue, els, cur, cnt, limit)
|
||||
if rep is None:
|
||||
return
|
||||
|
||||
@ -92,7 +90,7 @@ class API:
|
||||
"querySource": "typed_query",
|
||||
**(kv or {}),
|
||||
}
|
||||
async for x in self._gql_items(op, kv, ft=SEARCH_FEATURES, limit=limit):
|
||||
async for x in self._gql_items(op, kv, limit=limit):
|
||||
yield x
|
||||
|
||||
async def search(self, q: str, limit=-1, kv=None):
|
||||
@ -110,12 +108,13 @@ class API:
|
||||
"highlights_tweets_tab_ui_enabled": True,
|
||||
"creator_subscriptions_tweet_preview_api_enabled": True,
|
||||
"hidden_profile_subscriptions_enabled": True,
|
||||
"responsive_web_twitter_article_notes_tab_enabled": False,
|
||||
}
|
||||
return await self._gql_item(op, kv, ft)
|
||||
|
||||
async def user_by_id(self, uid: int, kv=None):
|
||||
async def user_by_id(self, uid: int, kv=None) -> User | None:
|
||||
rep = await self.user_by_id_raw(uid, kv=kv)
|
||||
return parse_user(rep)
|
||||
return parse_user(rep) if rep else None
|
||||
|
||||
# user_by_login
|
||||
|
||||
@ -126,15 +125,16 @@ class API:
|
||||
"highlights_tweets_tab_ui_enabled": True,
|
||||
"hidden_profile_likes_enabled": True,
|
||||
"creator_subscriptions_tweet_preview_api_enabled": True,
|
||||
"subscriptions_verification_info_verified_since_enabled": True,
|
||||
"hidden_profile_subscriptions_enabled": True,
|
||||
"subscriptions_verification_info_verified_since_enabled": True,
|
||||
"subscriptions_verification_info_is_identity_verified_enabled": False,
|
||||
"responsive_web_twitter_article_notes_tab_enabled": False,
|
||||
}
|
||||
return await self._gql_item(op, kv, ft)
|
||||
|
||||
async def user_by_login(self, login: str, kv=None):
|
||||
async def user_by_login(self, login: str, kv=None) -> User | None:
|
||||
rep = await self.user_by_login_raw(login, kv=kv)
|
||||
return parse_user(rep)
|
||||
return parse_user(rep) if rep else None
|
||||
|
||||
# tweet_details
|
||||
|
||||
@ -157,23 +157,19 @@ class API:
|
||||
"withSuperFollowsUserFields": False,
|
||||
**(kv or {}),
|
||||
}
|
||||
ft = {
|
||||
"responsive_web_twitter_blue_verified_badge_is_enabled": True,
|
||||
"longform_notetweets_richtext_consumption_enabled": True,
|
||||
**SEARCH_FEATURES,
|
||||
}
|
||||
return await self._gql_item(op, kv, ft)
|
||||
return await self._gql_item(op, kv)
|
||||
|
||||
async def tweet_details(self, twid: int, kv=None):
|
||||
async def tweet_details(self, twid: int, kv=None) -> Tweet | None:
|
||||
rep = await self.tweet_details_raw(twid, kv=kv)
|
||||
return parse_tweet(rep, twid)
|
||||
return parse_tweet(rep, twid) if rep else None
|
||||
|
||||
# followers
|
||||
|
||||
async def followers_raw(self, uid: int, limit=-1, kv=None):
|
||||
op = OP_Followers
|
||||
kv = {"userId": str(uid), "count": 20, "includePromotedContent": False, **(kv or {})}
|
||||
async for x in self._gql_items(op, kv, limit=limit):
|
||||
ft = {"responsive_web_twitter_article_notes_tab_enabled": False}
|
||||
async for x in self._gql_items(op, kv, limit=limit, ft=ft):
|
||||
yield x
|
||||
|
||||
async def followers(self, uid: int, limit=-1, kv=None):
|
||||
@ -266,12 +262,8 @@ class API:
|
||||
|
||||
async def list_timeline_raw(self, list_id: int, limit=-1, kv=None):
|
||||
op = OP_ListLatestTweetsTimeline
|
||||
kv = {
|
||||
"listId": str(list_id),
|
||||
"count": 20,
|
||||
**(kv or {}),
|
||||
}
|
||||
async for x in self._gql_items(op, kv, ft=SEARCH_FEATURES, limit=limit):
|
||||
kv = {"listId": str(list_id), "count": 20, **(kv or {})}
|
||||
async for x in self._gql_items(op, kv, limit=limit):
|
||||
yield x
|
||||
|
||||
async def list_timeline(self, list_id: int, limit=-1, kv=None):
|
||||
|
||||
@ -4,26 +4,27 @@ GQL_URL = "https://twitter.com/i/api/graphql"
|
||||
LOGIN_URL = "https://api.twitter.com/1.1/onboarding/task.json"
|
||||
|
||||
|
||||
OP_SearchTimeline = "lZ0GCEojmtQfiUQa5oJSEw/SearchTimeline"
|
||||
OP_UserByRestId = "QdS5LJDl99iL_KUzckdfNQ/UserByRestId"
|
||||
OP_UserByScreenName = "G3KGOASz96M-Qu0nwmGXNg/UserByScreenName"
|
||||
OP_TweetDetail = "BbmLpxKh8rX8LNe2LhVujA/TweetDetail"
|
||||
OP_Followers = "9LlZicVr2IBf4u2qW5n4-A/Followers"
|
||||
OP_Following = "8cyc0OKedV_XD62fBjzxUw/Following"
|
||||
OP_Retweeters = "Y2XHDEKtlJDA_ql2G3OZZQ/Retweeters"
|
||||
OP_Favoriters = "zXD9lMy1-V_N1OcON9JtEQ/Favoriters"
|
||||
OP_UserTweets = "VgitpdpNZ-RUIp5D1Z_D-A/UserTweets"
|
||||
OP_UserTweetsAndReplies = "YlkSUg0mRBx7-EkxCvc-bw/UserTweetsAndReplies"
|
||||
OP_ListLatestTweetsTimeline = "d1mUZHaqFMxe0xHI3rVc-w/ListLatestTweetsTimeline"
|
||||
OP_SearchTimeline = "Aj1nGkALq99Xg3XI0OZBtw/SearchTimeline"
|
||||
OP_UserByRestId = "CO4_gU4G_MRREoqfiTh6Hg/UserByRestId"
|
||||
OP_UserByScreenName = "NimuplG1OB7Fd2btCLdBOw/UserByScreenName"
|
||||
OP_TweetDetail = "-H4B_lJDEA-O_7_qWaRiyg/TweetDetail"
|
||||
OP_Followers = "3_7xfjmh897x8h_n6QBqTA/Followers"
|
||||
OP_Following = "0yD6Eiv23DKXRDU9VxlG2A/Following"
|
||||
OP_Retweeters = "sOBhVzDeJl4XGepvi5pHlg/Retweeters"
|
||||
OP_Favoriters = "E-ZTxvWWIkmOKwYdNTEefg/Favoriters"
|
||||
OP_UserTweets = "V1ze5q3ijDS1VeLwLY0m7g/UserTweets"
|
||||
OP_UserTweetsAndReplies = "16nOjYqEdV04vN6-rgg8KA/UserTweetsAndReplies"
|
||||
OP_ListLatestTweetsTimeline = "whF0_KH1fCkdLLoyNPMoEw/ListLatestTweetsTimeline"
|
||||
|
||||
# search values here (view source) https://twitter.com/
|
||||
GQL_FEATURES = {
|
||||
"blue_business_profile_image_shape_enabled": True,
|
||||
# "blue_business_profile_image_shape_enabled": True,
|
||||
"responsive_web_graphql_exclude_directive_enabled": True,
|
||||
"verified_phone_label_enabled": False,
|
||||
"responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
|
||||
"responsive_web_graphql_timeline_navigation_enabled": True,
|
||||
"tweetypie_unmention_optimization_enabled": True,
|
||||
"vibe_api_enabled": True,
|
||||
# "vibe_api_enabled": True,
|
||||
"responsive_web_edit_tweet_api_enabled": True,
|
||||
"graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
|
||||
"view_counts_everywhere_api_enabled": True,
|
||||
@ -31,16 +32,17 @@ GQL_FEATURES = {
|
||||
"tweet_awards_web_tipping_enabled": False,
|
||||
"freedom_of_speech_not_reach_fetch_enabled": True,
|
||||
"standardized_nudges_misinfo": True,
|
||||
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
|
||||
"interactive_text_enabled": True,
|
||||
"responsive_web_text_conversations_enabled": False,
|
||||
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
|
||||
# "interactive_text_enabled": True,
|
||||
# "responsive_web_text_conversations_enabled": False,
|
||||
"longform_notetweets_rich_text_read_enabled": True,
|
||||
"responsive_web_enhance_cards_enabled": False,
|
||||
"creator_subscriptions_tweet_preview_api_enabled": True,
|
||||
"longform_notetweets_inline_media_enabled": True,
|
||||
"responsive_web_media_download_video_enabled": False,
|
||||
"rweb_lists_timeline_redesign_enabled": True,
|
||||
# "rweb_lists_timeline_redesign_enabled": True,
|
||||
"responsive_web_twitter_article_tweet_consumption_enabled": False,
|
||||
"responsive_web_home_pinned_timelines_enabled": True,
|
||||
# "responsive_web_home_pinned_timelines_enabled": True,
|
||||
"c9s_tweet_anatomy_moderator_badge_enabled": True,
|
||||
"rweb_video_timestamps_enabled": True,
|
||||
}
|
||||
|
||||
@ -133,8 +133,8 @@ class QueueClient:
|
||||
fn(f"{rep.status_code:3d} - {req_id(rep)} - {err_msg}")
|
||||
|
||||
# need to add some features in api.py
|
||||
if err_msg.startswith("The following features cannot be null"):
|
||||
logger.error(f"Invalid request: {err_msg}")
|
||||
if err_msg.startswith("(336) The following features cannot be null"):
|
||||
logger.error(f"Update required: {err_msg}")
|
||||
exit(1)
|
||||
|
||||
# general api rate limit
|
||||
@ -169,11 +169,11 @@ class QueueClient:
|
||||
# Something from twitter side, just ignore it
|
||||
# https://github.com/vladkens/twscrape/pull/95
|
||||
if rep.status_code == 200 and "Authorization" in err_msg:
|
||||
logger.warning(f"Unknown authorization error: {err_msg}")
|
||||
logger.warning(f"Authorization unknown error: {err_msg}")
|
||||
return
|
||||
|
||||
if err_msg != "OK":
|
||||
logger.warning(f"Unknown API error: {err_msg}")
|
||||
logger.warning(f"API unknown error: {err_msg}")
|
||||
return # ignore any other unknown errors
|
||||
|
||||
rep.raise_for_status()
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user