# twscrape/_get_gql_ops.py

import json
import os
import re

import httpx
from fake_useragent import UserAgent

# Shared HTTP client used for every request this script makes; it sends the
# Chrome user-agent string supplied by fake_useragent.
_client_headers = {"user-agent": UserAgent().chrome}
client = httpx.Client(headers=_client_headers)

# GraphQL operation names that get their own summary section at the end of
# the script output. Order here is the order they are printed in.
ops = [
    "SearchTimeline",
    "UserByRestId",
    "UserByScreenName",
    "TweetDetail",
    "Followers",
    "Following",
    "Retweeters",
    "Favoriters",
    "UserTweets",
    "UserTweetsAndReplies",
    "ListLatestTweetsTimeline",
]


def script_url(k: str, v: str):
    """Return the URL of the client-web bundle named ``{k}.{v}.js``."""
    base = "https://abs.twimg.com/responsive-web/client-web"
    filename = f"{k}.{v}.js"
    return f"{base}/{filename}"


def get_scripts():
    """Return the text of the web client's JS bundles, downloading as needed.

    The page ``https://twitter.com/elonmusk`` is fetched with the module-level
    ``client`` and the bundle URLs are derived from its HTML:

    * the JSON mapping embedded between the ``e=>e+"."+`` and ``[e]+"a.js"``
      markers, where each ``key: value`` pair becomes ``{key}.{value}a.js``;
    * the ``main`` bundle, whose version token follows ``/client-web/main.``.

    URLs containing ``/i18n/``, ``/icons/`` or ``react-syntax-highlighter``
    are skipped. Every remaining bundle is cached in ``/tmp/twscrape-ops``
    under its file name; only bundles missing from the cache are downloaded
    (a progress line is printed for each download).

    Returns:
        list[str]: The text of every selected bundle, in URL order.

    Raises:
        httpx.HTTPStatusError: If a response fails ``raise_for_status()``.
        IndexError: If one of the expected markers is missing from the page.
        SystemExit: With code 1 if the embedded mapping is not valid JSON
            (the offending text and the decode error are printed first).
    """
    cache_dir = "/tmp/twscrape-ops"
    os.makedirs(cache_dir, exist_ok=True)

    rep = client.get("https://twitter.com/elonmusk")
    rep.raise_for_status()
    page = rep.text

    # Text between the two markers is expected to be a JSON object.
    chunk_map_src = page.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
    try:
        chunk_map = json.loads(chunk_map_src)
    except json.JSONDecodeError as e:
        print(chunk_map_src)
        print(e)
        # The bare `exit` builtin is injected by the `site` module and is not
        # available under `python -S` or in frozen/embedded interpreters, so
        # raise SystemExit directly instead.
        raise SystemExit(1) from e

    # The page's loader appends "a.js" to each value, hence the extra "a".
    urls = [script_url(k, f"{v}a") for k, v in chunk_map.items()]

    main_version = page.split("/client-web/main.")[1].split(".")[0]
    urls.append(script_url("main", main_version))

    skipped_markers = ("/i18n/", "/icons/", "react-syntax-highlighter")
    urls = [x for x in urls if not any(marker in x for marker in skipped_markers)]

    scripts = []
    for i, x in enumerate(urls, 1):
        # Cache key is the bundle's file name with any query string removed.
        cache_path = os.path.join(cache_dir, x.split("/")[-1].split("?")[0])
        if os.path.exists(cache_path):
            # Explicit UTF-8 so the cache does not depend on the locale's
            # default encoding.
            with open(cache_path, encoding="utf-8") as fp:
                scripts.append(fp.read())
            continue

        print(f"({i:3d} / {len(urls):3d}) {x}")
        rep = client.get(x)
        rep.raise_for_status()

        with open(cache_path, "w", encoding="utf-8") as fp:
            fp.write(rep.text)
        scripts.append(rep.text)

    return scripts


# Collect "operation name -> query id" for every GraphQL operation found in
# the bundles returned by get_scripts().
_OP_PATTERN = re.compile(r'queryId:"(.+?)".+?operationName:"(.+?)"')

all_pairs = {}
for script_text in get_scripts():
    # Within one script the last id seen for a name wins; conflicts are only
    # reported against ids collected from earlier scripts.
    found = dict((name, query_id) for query_id, name in _OP_PATTERN.findall(script_text))

    for name, query_id in found.items():
        previous = all_pairs.get(name, query_id)
        if previous != query_id:
            print(f"DIFF: {name} = {query_id} != {previous}")

        all_pairs[name] = query_id


# Full dump of every operation discovered.
for name, query_id in all_pairs.items():
    print(f'OP_{name} = "{query_id}/{name}"')

print("-" * 40)

# Summary restricted to the operations listed in `ops`; "???" marks an
# operation that was not found in any script.
for wanted in ops:
    query_id = all_pairs.get(wanted, "???")
    print(f'OP_{wanted} = "{query_id}/{wanted}"')