fix timezone issue; add _type to models

Этот коммит содержится в:
Vlad Pronsky 2023-05-09 22:23:22 +03:00
родитель 799f6c330a
Коммит 2587a62f3e
9 изменённых файлов: 111 добавлений и 20 удалений

Просмотреть файл

@ -8,15 +8,20 @@ build:
lint: lint:
ruff check twscrape ruff check twscrape
ruff check tests
lint-fix: lint-fix:
ruff check --fix twscrape ruff check --fix twscrape
ruff check --fix tests
pylint: pylint:
pylint --errors-only twscrape pylint --errors-only twscrape
test: test:
pytest --cov=twscrape tests/ pytest -s --cov=twscrape tests/
get-cov:
coverage report -m
act: act:
act --container-architecture linux/amd64 act --container-architecture linux/amd64

Просмотреть файл

@ -75,8 +75,13 @@ if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())
``` ```
You can use `login_all` once in your program to pass the login flow and add the accounts to the database. Re-runs will use the previously activated accounts. Note on rate limits:
- Search API – 250 requests per account / 15 minutes
- GraphQL API – 500 requests per account per operation / 15 minutes
### Models ### Models
- [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet) - [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet)
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User) - [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
### Related
- [SNScrape](https://github.com/JustAnotherArchivist/snscrape) – is a scraper for social networking services (SNS)

Просмотреть файл

@ -65,8 +65,14 @@ async def test_search():
obj = doc.dict() obj = doc.dict()
assert doc.id == obj["id"] assert doc.id == obj["id"]
assert doc.user.id == obj["user"]["id"] assert doc.user.id == obj["user"]["id"]
assert "url" in obj assert "url" in obj
assert "_type" in obj
assert obj["_type"] == "snscrape.modules.twitter.Tweet"
assert "url" in obj["user"] assert "url" in obj["user"]
assert "_type" in obj["user"]
assert obj["user"]["_type"] == "snscrape.modules.twitter.User"
txt = doc.json() txt = doc.json()
assert isinstance(txt, str) assert isinstance(txt, str)

Просмотреть файл

@ -2,8 +2,9 @@ import os
from twscrape.accounts_pool import AccountsPool from twscrape.accounts_pool import AccountsPool
from twscrape.db import DB from twscrape.db import DB
from twscrape.utils import utc_ts
DB_FILE = "/tmp/twapi_test.db" DB_FILE = "/tmp/twscrape_test.db"
def remove_db(): def remove_db():
@ -105,3 +106,58 @@ async def test_get_for_queue():
# should return None # should return None
acc = await pool.get_for_queue(Q) acc = await pool.get_for_queue(Q)
assert acc is None assert acc is None
async def test_account_unlock():
    """Locking via get_for_queue, explicit unlock, and lock_until should round-trip."""
    # fresh database so lock state from other tests cannot leak in
    remove_db()
    pool = AccountsPool(DB_FILE)
    Q = "test_queue"

    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    await pool.set_active("user1", True)

    # taking an account for a queue locks it for that queue
    acc = await pool.get_for_queue(Q)
    assert acc is not None
    assert acc.locks[Q] is not None

    # should unlock account and make available for queue
    await pool.unlock(acc.username, Q)
    acc = await pool.get_for_queue(Q)
    assert acc is not None
    assert acc.locks[Q] is not None

    # should update lock time
    end_time = utc_ts() + 60  # + 1 minute
    await pool.lock_until(acc.username, Q, end_time)
    acc = await pool.get(acc.username)
    # lock is stored as a UTC datetime; compare via unix timestamp
    assert int(acc.locks[Q].timestamp()) == end_time
async def test_get_stats():
    """stats() should track totals, active accounts, and per-queue locks."""
    # fresh database so counters start from zero
    remove_db()
    pool = AccountsPool(DB_FILE)
    Q = "search"

    # should return empty stats
    stats = await pool.stats()
    for k, v in stats.items():
        assert v == 0, f"{k} should be 0"

    # should increase total
    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 0

    # should increase active
    await pool.set_active("user1", True)
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 1

    # should update queue stats
    await pool.get_for_queue(Q)
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 1
    assert stats["locked_search"] == 1

Просмотреть файл

@ -6,6 +6,7 @@ from datetime import datetime
from httpx import AsyncClient, AsyncHTTPTransport from httpx import AsyncClient, AsyncHTTPTransport
from .constants import TOKEN from .constants import TOKEN
from .utils import from_utciso
@dataclass @dataclass
@ -43,7 +44,7 @@ class Account:
@staticmethod @staticmethod
def from_rs(rs: sqlite3.Row): def from_rs(rs: sqlite3.Row):
doc = dict(rs) doc = dict(rs)
doc["locks"] = {k: datetime.fromisoformat(v) for k, v in json.loads(doc["locks"]).items()} doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()}
doc["headers"] = json.loads(doc["headers"]) doc["headers"] = json.loads(doc["headers"])
doc["cookies"] = json.loads(doc["cookies"]) doc["cookies"] = json.loads(doc["cookies"])
doc["active"] = bool(doc["active"]) doc["active"] = bool(doc["active"])

Просмотреть файл

@ -88,7 +88,6 @@ class AccountsPool:
await execute(self._db_file, qs, {"username": username, "active": active}) await execute(self._db_file, qs, {"username": username, "active": active})
async def lock_until(self, username: str, queue: str, unlock_at: int): async def lock_until(self, username: str, queue: str, unlock_at: int):
# unlock_at is unix timestamp
qs = f""" qs = f"""
UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch')) UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch'))
WHERE username = :username WHERE username = :username
@ -141,11 +140,18 @@ class AccountsPool:
WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now') WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now')
""" """
gql_ops = """
UserByRestId UserByScreenName TweetDetail Followers Following
Retweeters Favoriters UserTweets UserTweetsAndReplies
"""
gql_ops = [x.strip() for x in gql_ops.split(" ") if x.strip()]
config = [ config = [
("total", "SELECT COUNT(*) FROM accounts"), ("total", "SELECT COUNT(*) FROM accounts"),
("active", "SELECT COUNT(*) FROM accounts WHERE active = true"), ("active", "SELECT COUNT(*) FROM accounts WHERE active = true"),
("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"), ("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"),
("locked_search", by_queue("search")), ("locked_search", by_queue("search")),
*[(f"locked_{x}", by_queue(x)) for x in gql_ops],
] ]
qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}" qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}"

Просмотреть файл

@ -1,5 +1,4 @@
import json import json
import time
from datetime import datetime from datetime import datetime
from typing import Awaitable, Callable from typing import Awaitable, Callable
@ -9,7 +8,7 @@ from .accounts_pool import AccountsPool
from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL
from .logger import logger from .logger import logger
from .models import Tweet, User from .models import Tweet, User
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_search_like from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep, utc_ts
class API: class API:
@ -95,8 +94,8 @@ class API:
# possible account banned # possible account banned
if rep.status_code == 403: if rep.status_code == 403:
logger.debug(f"Ban for {log_id}") logger.warning(f"403 for {log_id}")
reset_ts = int(time.time() + 60 * 60) # 1 hour reset_ts = utc_ts() + 60 * 60 # + 1 hour
await self.pool.lock_until(acc.username, queue, reset_ts) await self.pool.lock_until(acc.username, queue, reset_ts)
continue continue
@ -256,7 +255,7 @@ class API:
async def tweet_details(self, twid: int): async def tweet_details(self, twid: int):
rep = await self.tweet_details_raw(twid) rep = await self.tweet_details_raw(twid)
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
return Tweet.parse(obj["tweets"][str(twid)], obj) return Tweet.parse(obj["tweets"][str(twid)], obj)
# followers # followers
@ -269,7 +268,7 @@ class API:
async def followers(self, uid: int, limit=-1): async def followers(self, uid: int, limit=-1):
async for rep in self.followers_raw(uid, limit=limit): async for rep in self.followers_raw(uid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["users"].items(): for _, v in obj["users"].items():
yield User.parse(v) yield User.parse(v)
@ -283,7 +282,7 @@ class API:
async def following(self, uid: int, limit=-1): async def following(self, uid: int, limit=-1):
async for rep in self.following_raw(uid, limit=limit): async for rep in self.following_raw(uid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["users"].items(): for _, v in obj["users"].items():
yield User.parse(v) yield User.parse(v)
@ -297,7 +296,7 @@ class API:
async def retweeters(self, twid: int, limit=-1): async def retweeters(self, twid: int, limit=-1):
async for rep in self.retweeters_raw(twid, limit=limit): async for rep in self.retweeters_raw(twid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["users"].items(): for _, v in obj["users"].items():
yield User.parse(v) yield User.parse(v)
@ -311,7 +310,7 @@ class API:
async def favoriters(self, twid: int, limit=-1): async def favoriters(self, twid: int, limit=-1):
async for rep in self.favoriters_raw(twid, limit=limit): async for rep in self.favoriters_raw(twid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["users"].items(): for _, v in obj["users"].items():
yield User.parse(v) yield User.parse(v)
@ -332,7 +331,7 @@ class API:
async def user_tweets(self, uid: int, limit=-1): async def user_tweets(self, uid: int, limit=-1):
async for rep in self.user_tweets_raw(uid, limit=limit): async for rep in self.user_tweets_raw(uid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["tweets"].items(): for _, v in obj["tweets"].items():
yield Tweet.parse(v, obj) yield Tweet.parse(v, obj)
@ -353,6 +352,6 @@ class API:
async def user_tweets_and_replies(self, uid: int, limit=-1): async def user_tweets_and_replies(self, uid: int, limit=-1):
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit): async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
obj = to_search_like(rep.json()) obj = to_old_rep(rep.json())
for _, v in obj["tweets"].items(): for _, v in obj["tweets"].items():
yield Tweet.parse(v, obj) yield Tweet.parse(v, obj)

Просмотреть файл

@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional
from .logger import logger
from .utils import find_item, get_or, int_or_none from .utils import find_item, get_or, int_or_none
@ -77,6 +76,7 @@ class UserRef(JSONTrait):
id: int id: int
username: str username: str
displayname: str displayname: str
_type: str = "snscrape.modules.twitter.UserRef"
@staticmethod @staticmethod
def parse(obj: dict): def parse(obj: dict):
@ -102,7 +102,9 @@ class User(JSONTrait):
profileBannerUrl: str | None = None profileBannerUrl: str | None = None
protected: bool | None = None protected: bool | None = None
verified: bool | None = None verified: bool | None = None
_type: str = "snscrape.modules.twitter.User"
# todo:
# descriptionLinks: typing.Optional[typing.List[TextLink]] = None # descriptionLinks: typing.Optional[typing.List[TextLink]] = None
# link: typing.Optional[TextLink] = None # link: typing.Optional[TextLink] = None
# label: typing.Optional["UserLabel"] = None # label: typing.Optional["UserLabel"] = None
@ -157,7 +159,9 @@ class Tweet(JSONTrait):
source: str | None = None source: str | None = None
sourceUrl: str | None = None sourceUrl: str | None = None
sourceLabel: str | None = None sourceLabel: str | None = None
_type: str = "snscrape.modules.twitter.Tweet"
# todo:
# renderedContent: str # renderedContent: str
# media: typing.Optional[typing.List["Medium"]] = None # media: typing.Optional[typing.List["Medium"]] = None
# card: typing.Optional["Card"] = None # card: typing.Optional["Card"] = None
@ -211,7 +215,7 @@ def _get_reply_user(tw_obj: dict, res: dict):
if mention: if mention:
return UserRef.parse(mention) return UserRef.parse(mention)
logger.debug(f'{tw_obj["in_reply_to_user_id_str"]}\n{json.dumps(res)}') # todo: user not found in reply (probably deleted or hidden)
return None return None

Просмотреть файл

@ -1,5 +1,6 @@
import json import json
from collections import defaultdict from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, AsyncGenerator, Callable, TypeVar from typing import Any, AsyncGenerator, Callable, TypeVar
from httpx import HTTPStatusError, Response from httpx import HTTPStatusError, Response
@ -123,7 +124,7 @@ def to_old_obj(obj: dict):
return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])} return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])}
def to_search_like(obj: dict): def to_old_rep(obj: dict):
tmp = get_typed_object(obj, defaultdict(list)) tmp = get_typed_object(obj, defaultdict(list))
tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x] tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x]
@ -133,3 +134,11 @@ def to_search_like(obj: dict):
users = {str(x["rest_id"]): to_old_obj(x) for x in users} users = {str(x["rest_id"]): to_old_obj(x) for x in users}
return {"tweets": tweets, "users": users} return {"tweets": tweets, "users": users}
def utc_ts() -> int:
    """Return the current UTC time as an integer unix timestamp."""
    # datetime.utcnow() is naive and deprecated since Python 3.12;
    # datetime.now(timezone.utc) gives the same instant, already tz-aware,
    # so the .replace(tzinfo=...) dance is unnecessary.
    return int(datetime.now(timezone.utc).timestamp())
def from_utciso(iso: str) -> datetime:
    """Parse an ISO-8601 datetime string, assuming UTC when no offset is present.

    SQLite's datetime() emits naive "YYYY-MM-DD HH:MM:SS" strings that are in
    UTC; tagging them with timezone.utc makes comparisons against aware
    datetimes (e.g. utc_ts()-derived values) correct.
    """
    dt = datetime.fromisoformat(iso)
    # Only attach UTC when the string carried no offset: calling replace() on
    # an aware datetime would relabel (not convert) its zone and shift the
    # represented instant.
    return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)