import json
from time import time
from typing import Awaitable, Callable

from httpx import AsyncClient, HTTPStatusError, Response
from loguru import logger

from .models import Tweet, User
from .pool import AccountsPool
from .utils import encode_params, find_item, to_old_obj, to_search_like

BASIC_SEARCH_PARAMS = """
include_profile_interstitial_type=1
include_blocking=1
include_blocked_by=1
include_followed_by=1
include_want_retweets=1
include_mute_edge=1
include_can_dm=1
include_can_media_tag=1
include_ext_has_nft_avatar=1
include_ext_is_blue_verified=1
include_ext_verified_type=1
include_ext_profile_image_shape=1
skip_status=1
cards_platform=Web-12
include_cards=1
include_ext_alt_text=true
include_ext_limited_action_results=false
include_quote_count=true
include_reply_count=1
tweet_mode=extended
include_ext_views=true
include_entities=true
include_user_entities=true
include_ext_media_color=true
include_ext_media_availability=true
include_ext_sensitive_media_warning=true
include_ext_trusted_friends_metadata=true
send_error_codes=true
simple_quoted_tweet=true
tweet_search_mode=live
query_source=recent_search_click
pc=1
spelling_corrections=1
include_ext_edit_control=true
ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2CbirdwatchPivot%2Cenrichments%2CsuperFollowMetadata%2CunmentionInfo%2CeditControl%2Cvibe
"""

BASE_FEATURES = {
    "blue_business_profile_image_shape_enabled": True,
    "responsive_web_graphql_exclude_directive_enabled": True,
    "verified_phone_label_enabled": False,
    "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
    "responsive_web_graphql_timeline_navigation_enabled": True,
    #
    "tweetypie_unmention_optimization_enabled": True,
    "vibe_api_enabled": True,
    "responsive_web_edit_tweet_api_enabled": True,
    "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
    "view_counts_everywhere_api_enabled": True,
    "longform_notetweets_consumption_enabled": True,
    "tweet_awards_web_tipping_enabled": False,
    "freedom_of_speech_not_reach_fetch_enabled": True,
    "standardized_nudges_misinfo": True,
    "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
    "interactive_text_enabled": True,
    "responsive_web_text_conversations_enabled": False,
    "longform_notetweets_rich_text_read_enabled": True,
    "responsive_web_enhance_cards_enabled": False,
}

SEARCH_URL = "https://api.twitter.com/2/search/adaptive.json"
SEARCH_PARAMS = dict(x.split("=") for x in BASIC_SEARCH_PARAMS.splitlines() if x)
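# SEARCH_PARAMS is the query-string template above parsed into a flat dict of strings,
# e.g. SEARCH_PARAMS["tweet_mode"] == "extended" and SEARCH_PARAMS["include_cards"] == "1".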
GRAPHQL_URL = "https://twitter.com/i/api/graphql"


def filter_null(obj: dict):
    try:
        return {k: v for k, v in obj.items() if v is not None}
    except AttributeError:
        return obj


def json_params(obj: dict):
    return {k: json.dumps(filter_null(v), separators=(",", ":")) for k, v in obj.items()}
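# Example: json_params({"variables": {"count": 20, "cursor": None}})
#   -> {"variables": '{"count":20}'}  (None values dropped, JSON serialized compactly)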


def get_ql_entries(obj: dict) -> list[dict]:
    entries = find_item(obj, "entries")
    return entries or []


class Search:
    def __init__(self, pool: AccountsPool):
        self.pool = pool

    # http helpers

    def _limit_msg(self, rep: Response):
        lr = rep.headers.get("x-rate-limit-remaining", -1)
        ll = rep.headers.get("x-rate-limit-limit", -1)
        return f"{lr}/{ll}"

    def _is_end(self, rep: Response, q: str, res: list, cur: str | None, cnt: int, lim: int):
        new_count = len(res)
        new_total = cnt + new_count

        is_res = new_count > 0
        is_cur = cur is not None
        is_lim = lim > 0 and new_total >= lim

        stats = f"{q} {new_total:,d} (+{new_count:,d})"
        flags = f"res={int(is_res)} cur={int(is_cur)} lim={int(is_lim)}"
        logger.debug(" ".join([stats, flags, self._limit_msg(rep)]))

        return new_total, not is_res, not is_cur or is_lim
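    # _is_end returns (new_total, end_before, end_after): end_before means the page was
    # empty (stop without yielding it), end_after means there is no next cursor or the
    # requested limit has been reached (yield this page, then stop).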

    async def _inf_req(self, queue: str, cb: Callable[[AsyncClient], Awaitable[Response]]):
        while True:
            account = await self.pool.get_account_or_wait(queue)

            try:
                while True:
                    rep = await cb(account.client)
                    rep.raise_for_status()
                    yield rep
            except HTTPStatusError as e:
                if e.response.status_code == 429:
                    logger.debug(f"Rate limit for account={account.username} on queue={queue}")
                    reset_ts = int(e.response.headers.get("x-rate-limit-reset", 0))
                    account.update_limit(queue, reset_ts)
                    continue

                if e.response.status_code == 403:
                    logger.debug(f"Account={account.username} is banned on queue={queue}")
                    reset_ts = int(time() + 60 * 60)  # 1 hour
                    account.update_limit(queue, reset_ts)
                    continue

                logger.error(f"[{e.response.status_code}] {e.request.url}\n{e.response.text}")
                raise e
            finally:
                account.unlock(queue)
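    # _inf_req rotates accounts from the pool: on HTTP 429 the account is paused until the
    # x-rate-limit-reset timestamp, on HTTP 403 it is paused for one hour, and the request
    # is retried with the next available account; other HTTP errors are re-raised.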

    def _get_search_cursor(self, res: dict) -> str | None:
        try:
            for x in res["timeline"]["instructions"]:
                entry = x.get("replaceEntry", None)
                if entry is not None and entry["entryIdToReplace"] == "sq-cursor-bottom":
                    return entry["entry"]["content"]["operation"]["cursor"]["value"]

                for entry in x.get("addEntries", {}).get("entries", []):
                    if entry["entryId"] == "sq-cursor-bottom":
                        return entry["content"]["operation"]["cursor"]["value"]
        except Exception as e:
            logger.debug(e)

        return None

    def get_ql_entries(self, obj: dict) -> list[dict]:
        entries = find_item(obj, "entries")
        return entries or []

    def _get_ql_cursor(self, obj: dict) -> str | None:
        try:
            for entry in self.get_ql_entries(obj):
                if entry["entryId"].startswith("cursor-bottom-"):
                    return entry["content"]["value"]
            return None
        except Exception:
            return None

    async def _ql_items(self, op: str, kv: dict, ft: dict = {}, limit=-1):
        queue, cursor, count = op.split("/")[-1], None, 0

        async def _get(client: AsyncClient):
            params = {"variables": {**kv, "cursor": cursor}, "features": {**BASE_FEATURES, **ft}}
            return await client.get(f"{GRAPHQL_URL}/{op}", params=encode_params(params))

        async for rep in self._inf_req(queue, _get):
            obj = rep.json()

            # cursor-top / cursor-bottom always present
            entries = self.get_ql_entries(obj)
            entries = [x for x in entries if not x["entryId"].startswith("cursor-")]
            cursor = self._get_ql_cursor(obj)

            check = self._is_end(rep, queue, entries, cursor, count, limit)
            count, end_before, end_after = check

            if end_before:
                return

            yield rep

            if end_after:
                return
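    # _ql_items pages through a GraphQL timeline endpoint: the non-cursor entries of each
    # response are counted against `limit`, and the bottom cursor (if any) is fed into the
    # next request.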

    async def _ql_item(self, op: str, kv: dict, ft: dict = {}):
        variables, features = {**kv}, {**BASE_FEATURES, **ft}
        params = {"variables": variables, "features": features}

        async def _get(client: AsyncClient):
            return await client.get(f"{GRAPHQL_URL}/{op}", params=encode_params(params))

        queue = op.split("/")[-1]
        async for rep in self._inf_req(queue, _get):
            logger.debug(f"{queue} {self._limit_msg(rep)}")
            return rep

        raise Exception("No response")  # todo

    # search
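    # Convention: each *_raw method yields the raw httpx responses, while the plain
    # method below it parses those responses into Tweet / User models.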

    async def search_raw(self, q: str, limit=-1):
        queue, cursor, count = "search", None, 0

        async def _get(client: AsyncClient):
            params = {**SEARCH_PARAMS, "q": q, "count": 20}
            params["cursor" if cursor else "requestContext"] = cursor if cursor else "launch"
            return await client.get(SEARCH_URL, params=params)

        async for rep in self._inf_req(queue, _get):
            data = rep.json()

            cursor = self._get_search_cursor(data)
            tweets = data.get("globalObjects", {}).get("tweets", [])

            check = self._is_end(rep, q, tweets, cursor, count, limit)
            count, end_before, end_after = check

            if end_before:
                return

            yield rep

            if end_after:
                return

    async def search(self, q: str, limit=-1):
        async for rep in self.search_raw(q, limit=limit):
            res = rep.json()
            obj = res.get("globalObjects", {})
            for x in list(obj.get("tweets", {}).values()):
                yield Tweet.parse(x, obj)
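    # Usage sketch (assumes an AccountsPool already populated with working accounts;
    # see .pool for how accounts are added, and .models for the exact Tweet/User fields):
    #   pool = AccountsPool()
    #   api = Search(pool)
    #   async for tweet in api.search("python lang:en", limit=100):
    #       print(tweet.id, tweet.user.username)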

    # user_by_id

    async def user_by_id_raw(self, uid: int):
        op = "GazOglcBvgLigl3ywt6b3Q/UserByRestId"
        kv = {"userId": str(uid), "withSafetyModeUserFields": True}
        return await self._ql_item(op, kv)
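    # Note: the GraphQL operation ids (the hash before the operation name, e.g.
    # "GazOglcBvgLigl3ywt6b3Q") are tied to a specific Twitter web-client build and
    # may need updating when Twitter rotates them.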

    async def user_by_id(self, uid: int):
        rep = await self.user_by_id_raw(uid)
        res = rep.json()
        return User.parse(to_old_obj(res["data"]["user"]["result"]))

    # user_by_login

    async def user_by_login_raw(self, login: str):
        op = "sLVLhk0bGj3MVFEKTdax1w/UserByScreenName"
        kv = {"screen_name": login, "withSafetyModeUserFields": True}
        return await self._ql_item(op, kv)

    async def user_by_login(self, login: str):
        rep = await self.user_by_login_raw(login)
        res = rep.json()
        return User.parse(to_old_obj(res["data"]["user"]["result"]))

    # tweet_details

    async def tweet_details_raw(self, twid: int):
        op = "zXaXQgfyR4GxE21uwYQSyA/TweetDetail"
        kv = {
            "focalTweetId": str(twid),
            "referrer": "tweet",  # tweet, profile
            "with_rux_injections": False,
            "includePromotedContent": True,
            "withCommunity": True,
            "withQuickPromoteEligibilityTweetFields": True,
            "withBirdwatchNotes": True,
            "withVoice": True,
            "withV2Timeline": True,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": False,
            "withSuperFollowsUserFields": False,
        }
        ft = {
            "responsive_web_twitter_blue_verified_badge_is_enabled": True,
            "longform_notetweets_richtext_consumption_enabled": True,
        }
        return await self._ql_item(op, kv, ft)

    async def tweet_details(self, twid: int):
        rep = await self.tweet_details_raw(twid)
        obj = to_search_like(rep.json())
        return Tweet.parse(obj["tweets"][str(twid)], obj)
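    # to_search_like (from .utils) flattens the GraphQL response into the legacy
    # {"tweets": {...}, "users": {...}} shape so the adaptive-search parsers
    # (Tweet.parse / User.parse) can be reused for GraphQL results.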

    # followers

    async def followers_raw(self, uid: int, limit=-1):
        op = "djdTXDIk2qhd4OStqlUFeQ/Followers"
        kv = {"userId": str(uid), "count": 20, "includePromotedContent": False}
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def followers(self, uid: int, limit=-1):
        async for rep in self.followers_raw(uid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["users"].items():
                yield User.parse(v)

    # following

    async def following_raw(self, uid: int, limit=-1):
        op = "IWP6Zt14sARO29lJT35bBw/Following"
        kv = {"userId": str(uid), "count": 20, "includePromotedContent": False}
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def following(self, uid: int, limit=-1):
        async for rep in self.following_raw(uid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["users"].items():
                yield User.parse(v)

    # retweeters

    async def retweeters_raw(self, twid: int, limit=-1):
        op = "U5f_jm0CiLmSfI1d4rGleQ/Retweeters"
        kv = {"tweetId": str(twid), "count": 20, "includePromotedContent": True}
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def retweeters(self, twid: int, limit=-1):
        async for rep in self.retweeters_raw(twid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["users"].items():
                yield User.parse(v)

    # favoriters

    async def favoriters_raw(self, twid: int, limit=-1):
        op = "vcTrPlh9ovFDQejz22q9vg/Favoriters"
        kv = {"tweetId": str(twid), "count": 20, "includePromotedContent": True}
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def favoriters(self, twid: int, limit=-1):
        async for rep in self.favoriters_raw(twid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["users"].items():
                yield User.parse(v)

    # user_tweets

    async def user_tweets_raw(self, uid: int, limit=-1):
        op = "CdG2Vuc1v6F5JyEngGpxVw/UserTweets"
        kv = {
            "userId": str(uid),
            "count": 40,
            "includePromotedContent": True,
            "withQuickPromoteEligibilityTweetFields": True,
            "withVoice": True,
            "withV2Timeline": True,
        }
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def user_tweets(self, uid: int, limit=-1):
        async for rep in self.user_tweets_raw(uid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["tweets"].items():
                yield Tweet.parse(v, obj)

    # user_tweets_and_replies

    async def user_tweets_and_replies_raw(self, uid: int, limit=-1):
        op = "zQxfEr5IFxQ2QZ-XMJlKew/UserTweetsAndReplies"
        kv = {
            "userId": str(uid),
            "count": 40,
            "includePromotedContent": True,
            "withCommunity": True,
            "withVoice": True,
            "withV2Timeline": True,
        }
        async for x in self._ql_items(op, kv, limit=limit):
            yield x

    async def user_tweets_and_replies(self, uid: int, limit=-1):
        async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
            obj = to_search_like(rep.json())
            for _, v in obj["tweets"].items():
                yield Tweet.parse(v, obj)