From 2587a62f3e7d4c06cc2a7bdcf04ccda26dc11b84 Mon Sep 17 00:00:00 2001 From: Vlad Pronsky Date: Tue, 9 May 2023 22:23:22 +0300 Subject: [PATCH] fix timezone issue; add _type to models --- Makefile | 7 ++++- readme.md | 9 ++++-- tests/test_parser.py | 6 ++++ tests/test_pool.py | 58 ++++++++++++++++++++++++++++++++++++++- twscrape/account.py | 3 +- twscrape/accounts_pool.py | 8 +++++- twscrape/api.py | 21 +++++++------- twscrape/models.py | 8 ++++-- twscrape/utils.py | 11 +++++++- 9 files changed, 111 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index ca49076..5129e03 100644 --- a/Makefile +++ b/Makefile @@ -8,15 +8,20 @@ build: lint: ruff check twscrape + ruff check tests lint-fix: ruff check --fix twscrape + ruff check --fix tests pylint: pylint --errors-only twscrape test: - pytest --cov=twscrape tests/ + pytest -s --cov=twscrape tests/ + +get-cov: + coverage report -m act: act --container-architecture linux/amd64 diff --git a/readme.md b/readme.md index 4fcebc1..580b705 100644 --- a/readme.md +++ b/readme.md @@ -75,8 +75,13 @@ if __name__ == "__main__": asyncio.run(main()) ``` -You can use `login_all` once in your program to pass the login flow and add the accounts to the database. Re-runs will use the previously activated accounts. 
+Note on rate limits: +- Search API – 250 requests per account / 15 minutes +- GraphQL API – 500 requests per account per operation / 15 minutes ### Models - [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet) -- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User) \ No newline at end of file +- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User) + +### Related +- [SNScrape](https://github.com/JustAnotherArchivist/snscrape) – is a scraper for social networking services (SNS) diff --git a/tests/test_parser.py b/tests/test_parser.py index 4e10f6c..3bfd934 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -65,8 +65,14 @@ async def test_search(): obj = doc.dict() assert doc.id == obj["id"] assert doc.user.id == obj["user"]["id"] + assert "url" in obj + assert "_type" in obj + assert obj["_type"] == "snscrape.modules.twitter.Tweet" + assert "url" in obj["user"] + assert "_type" in obj["user"] + assert obj["user"]["_type"] == "snscrape.modules.twitter.User" txt = doc.json() assert isinstance(txt, str) diff --git a/tests/test_pool.py b/tests/test_pool.py index 142d568..729dde5 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -2,8 +2,9 @@ import os from twscrape.accounts_pool import AccountsPool from twscrape.db import DB +from twscrape.utils import utc_ts -DB_FILE = "/tmp/twapi_test.db" +DB_FILE = "/tmp/twscrape_test.db" def remove_db(): @@ -105,3 +106,58 @@ async def test_get_for_queue(): # should return None acc = await pool.get_for_queue(Q) assert acc is None + + +async def test_account_unlock(): + remove_db() + pool = AccountsPool(DB_FILE) + Q = "test_queue" + + await pool.add_account("user1", "pass1", "email1", "email_pass1") + await pool.set_active("user1", True) + acc = await pool.get_for_queue(Q) + assert acc is not None + assert acc.locks[Q] is not None + + # should unlock account and make available for queue + 
await pool.unlock(acc.username, Q) + acc = await pool.get_for_queue(Q) + assert acc is not None + assert acc.locks[Q] is not None + + # should update lock time + end_time = utc_ts() + 60 # + 1 minute + await pool.lock_until(acc.username, Q, end_time) + + acc = await pool.get(acc.username) + assert int(acc.locks[Q].timestamp()) == end_time + + +async def test_get_stats(): + remove_db() + pool = AccountsPool(DB_FILE) + Q = "search" + + # should return empty stats + stats = await pool.stats() + for k, v in stats.items(): + assert v == 0, f"{k} should be 0" + + # should increase total + await pool.add_account("user1", "pass1", "email1", "email_pass1") + stats = await pool.stats() + assert stats["total"] == 1 + assert stats["active"] == 0 + + # should increase active + await pool.set_active("user1", True) + stats = await pool.stats() + assert stats["total"] == 1 + assert stats["active"] == 1 + + # should update queue stats + await pool.get_for_queue(Q) + stats = await pool.stats() + assert stats["total"] == 1 + assert stats["active"] == 1 + assert stats["locked_search"] == 1 diff --git a/twscrape/account.py b/twscrape/account.py index 8cebaa4..f4ff09b 100644 --- a/twscrape/account.py +++ b/twscrape/account.py @@ -6,6 +6,7 @@ from datetime import datetime from httpx import AsyncClient, AsyncHTTPTransport from .constants import TOKEN +from .utils import from_utciso @dataclass @@ -43,7 +44,7 @@ class Account: @staticmethod def from_rs(rs: sqlite3.Row): doc = dict(rs) - doc["locks"] = {k: datetime.fromisoformat(v) for k, v in json.loads(doc["locks"]).items()} + doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()} doc["headers"] = json.loads(doc["headers"]) doc["cookies"] = json.loads(doc["cookies"]) doc["active"] = bool(doc["active"]) diff --git a/twscrape/accounts_pool.py b/twscrape/accounts_pool.py index 816c002..a2a550b 100644 --- a/twscrape/accounts_pool.py +++ b/twscrape/accounts_pool.py @@ -88,7 +88,6 @@ class AccountsPool: await 
execute(self._db_file, qs, {"username": username, "active": active}) async def lock_until(self, username: str, queue: str, unlock_at: int): - # unlock_at is unix timestamp qs = f""" UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch')) WHERE username = :username @@ -141,11 +140,18 @@ class AccountsPool: WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now') """ + gql_ops = """ + UserByRestId UserByScreenName TweetDetail Followers Following + Retweeters Favoriters UserTweets UserTweetsAndReplies + """ + gql_ops = [x.strip() for x in gql_ops.split(" ") if x.strip()] + config = [ ("total", "SELECT COUNT(*) FROM accounts"), ("active", "SELECT COUNT(*) FROM accounts WHERE active = true"), ("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"), ("locked_search", by_queue("search")), + *[(f"locked_{x}", by_queue(x)) for x in gql_ops], ] qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}" diff --git a/twscrape/api.py b/twscrape/api.py index 921c876..ef5d26c 100644 --- a/twscrape/api.py +++ b/twscrape/api.py @@ -1,5 +1,4 @@ import json -import time from datetime import datetime from typing import Awaitable, Callable @@ -9,7 +8,7 @@ from .accounts_pool import AccountsPool from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL from .logger import logger from .models import Tweet, User -from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_search_like +from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep, utc_ts class API: @@ -95,8 +94,8 @@ class API: # possible account banned if rep.status_code == 403: - logger.debug(f"Ban for {log_id}") - reset_ts = int(time.time() + 60 * 60) # 1 hour + logger.warning(f"403 for {log_id}") + reset_ts = utc_ts() + 60 * 60 # + 1 hour await self.pool.lock_until(acc.username, queue, reset_ts) continue @@ -256,7 +255,7 @@ class API: async def tweet_details(self, twid: int): 
rep = await self.tweet_details_raw(twid) - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) return Tweet.parse(obj["tweets"][str(twid)], obj) # followers @@ -269,7 +268,7 @@ class API: async def followers(self, uid: int, limit=-1): async for rep in self.followers_raw(uid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["users"].items(): yield User.parse(v) @@ -283,7 +282,7 @@ class API: async def following(self, uid: int, limit=-1): async for rep in self.following_raw(uid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["users"].items(): yield User.parse(v) @@ -297,7 +296,7 @@ class API: async def retweeters(self, twid: int, limit=-1): async for rep in self.retweeters_raw(twid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["users"].items(): yield User.parse(v) @@ -311,7 +310,7 @@ class API: async def favoriters(self, twid: int, limit=-1): async for rep in self.favoriters_raw(twid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["users"].items(): yield User.parse(v) @@ -332,7 +331,7 @@ class API: async def user_tweets(self, uid: int, limit=-1): async for rep in self.user_tweets_raw(uid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["tweets"].items(): yield Tweet.parse(v, obj) @@ -353,6 +352,6 @@ class API: async def user_tweets_and_replies(self, uid: int, limit=-1): async for rep in self.user_tweets_and_replies_raw(uid, limit=limit): - obj = to_search_like(rep.json()) + obj = to_old_rep(rep.json()) for _, v in obj["tweets"].items(): yield Tweet.parse(v, obj) diff --git a/twscrape/models.py b/twscrape/models.py index 1058292..ead0b78 100644 --- a/twscrape/models.py +++ b/twscrape/models.py @@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass from datetime import datetime from typing import Optional -from 
.logger import logger from .utils import find_item, get_or, int_or_none @@ -77,6 +76,7 @@ class UserRef(JSONTrait): id: int username: str displayname: str + _type: str = "snscrape.modules.twitter.UserRef" @staticmethod def parse(obj: dict): @@ -102,7 +102,9 @@ class User(JSONTrait): profileBannerUrl: str | None = None protected: bool | None = None verified: bool | None = None + _type: str = "snscrape.modules.twitter.User" + # todo: # descriptionLinks: typing.Optional[typing.List[TextLink]] = None # link: typing.Optional[TextLink] = None # label: typing.Optional["UserLabel"] = None @@ -157,7 +159,9 @@ class Tweet(JSONTrait): source: str | None = None sourceUrl: str | None = None sourceLabel: str | None = None + _type: str = "snscrape.modules.twitter.Tweet" + # todo: # renderedContent: str # media: typing.Optional[typing.List["Medium"]] = None # card: typing.Optional["Card"] = None @@ -211,7 +215,7 @@ def _get_reply_user(tw_obj: dict, res: dict): if mention: return UserRef.parse(mention) - logger.debug(f'{tw_obj["in_reply_to_user_id_str"]}\n{json.dumps(res)}') + # todo: user not found in reply (probably deleted or hidden) return None diff --git a/twscrape/utils.py b/twscrape/utils.py index 5444763..cf33b8b 100644 --- a/twscrape/utils.py +++ b/twscrape/utils.py @@ -1,5 +1,6 @@ import json from collections import defaultdict +from datetime import datetime, timezone from typing import Any, AsyncGenerator, Callable, TypeVar from httpx import HTTPStatusError, Response @@ -123,7 +124,7 @@ def to_old_obj(obj: dict): return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])} -def to_search_like(obj: dict): +def to_old_rep(obj: dict): tmp = get_typed_object(obj, defaultdict(list)) tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x] @@ -133,3 +134,11 @@ def to_search_like(obj: dict): users = {str(x["rest_id"]): to_old_obj(x) for x in users} return {"tweets": tweets, "users": users} + + +def utc_ts() -> int: + return 
int(datetime.utcnow().replace(tzinfo=timezone.utc).timestamp()) + + +def from_utciso(iso: str) -> datetime: + return datetime.fromisoformat(iso).replace(tzinfo=timezone.utc)