fix timezone issue; add _type to models

Этот коммит содержится в:
Vlad Pronsky 2023-05-09 22:23:22 +03:00
родитель 799f6c330a
Коммит 2587a62f3e
9 изменённых файлов: 111 добавлений и 20 удалений

Просмотреть файл

@ -8,15 +8,20 @@ build:
lint:
ruff check twscrape
ruff check tests
lint-fix:
ruff check --fix twscrape
ruff check --fix tests
pylint:
pylint --errors-only twscrape
test:
pytest --cov=twscrape tests/
pytest -s --cov=twscrape tests/
get-cov:
coverage report -m
act:
act --container-architecture linux/amd64

Просмотреть файл

@ -75,8 +75,13 @@ if __name__ == "__main__":
asyncio.run(main())
```
You can use `login_all` once in your program to pass the login flow and add the accounts to the database. Re-runs will use the previously activated accounts.
Note on rate limits:
- Search API – 250 requests per account / 15 minutes
- GraphQL API – 500 requests per account per operation / 15 minutes
### Models
- [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet)
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
### Related
- [SNScrape](https://github.com/JustAnotherArchivist/snscrape) – a scraper for social networking services (SNS)

Просмотреть файл

@ -65,8 +65,14 @@ async def test_search():
obj = doc.dict()
assert doc.id == obj["id"]
assert doc.user.id == obj["user"]["id"]
assert "url" in obj
assert "_type" in obj
assert obj["_type"] == "snscrape.modules.twitter.Tweet"
assert "url" in obj["user"]
assert "_type" in obj["user"]
assert obj["user"]["_type"] == "snscrape.modules.twitter.User"
txt = doc.json()
assert isinstance(txt, str)

Просмотреть файл

@ -2,8 +2,9 @@ import os
from twscrape.accounts_pool import AccountsPool
from twscrape.db import DB
from twscrape.utils import utc_ts
DB_FILE = "/tmp/twapi_test.db"
DB_FILE = "/tmp/twscrape_test.db"
def remove_db():
@ -105,3 +106,58 @@ async def test_get_for_queue():
# should return None
acc = await pool.get_for_queue(Q)
assert acc is None
async def test_account_unlock():
    """An account can be locked for a queue, released, and re-locked until a timestamp."""
    remove_db()
    pool = AccountsPool(DB_FILE)
    queue = "test_queue"

    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    await pool.set_active("user1", True)

    acc = await pool.get_for_queue(queue)
    assert acc is not None
    assert acc.locks[queue] is not None

    # releasing the lock should make the account available for the queue again
    await pool.unlock(acc.username, queue)
    acc = await pool.get_for_queue(queue)
    assert acc is not None
    assert acc.locks[queue] is not None

    # lock_until should move the lock expiry to the requested unix timestamp
    end_time = utc_ts() + 60  # one minute from now
    await pool.lock_until(acc.username, queue, end_time)
    acc = await pool.get(acc.username)
    assert int(acc.locks[queue].timestamp()) == end_time
async def test_get_stats():
    """Stats should track total/active account counts and per-queue lock counts."""
    remove_db()
    pool = AccountsPool(DB_FILE)
    Q = "search"

    # should return empty stats
    stats = await pool.stats()
    for k, v in stats.items():
        assert v == 0, f"{k} should be 0"

    # should increase total
    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 0

    # should increase active
    await pool.set_active("user1", True)
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 1

    # should update queue stats
    await pool.get_for_queue(Q)
    stats = await pool.stats()
    assert stats["total"] == 1
    assert stats["active"] == 1
    assert stats["locked_search"] == 1

Просмотреть файл

@ -6,6 +6,7 @@ from datetime import datetime
from httpx import AsyncClient, AsyncHTTPTransport
from .constants import TOKEN
from .utils import from_utciso
@dataclass
@ -43,7 +44,7 @@ class Account:
@staticmethod
def from_rs(rs: sqlite3.Row):
doc = dict(rs)
doc["locks"] = {k: datetime.fromisoformat(v) for k, v in json.loads(doc["locks"]).items()}
doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()}
doc["headers"] = json.loads(doc["headers"])
doc["cookies"] = json.loads(doc["cookies"])
doc["active"] = bool(doc["active"])

Просмотреть файл

@ -88,7 +88,6 @@ class AccountsPool:
await execute(self._db_file, qs, {"username": username, "active": active})
async def lock_until(self, username: str, queue: str, unlock_at: int):
# unlock_at is unix timestamp
qs = f"""
UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch'))
WHERE username = :username
@ -141,11 +140,18 @@ class AccountsPool:
WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now')
"""
gql_ops = """
UserByRestId UserByScreenName TweetDetail Followers Following
Retweeters Favoriters UserTweets UserTweetsAndReplies
"""
gql_ops = [x.strip() for x in gql_ops.split(" ") if x.strip()]
config = [
("total", "SELECT COUNT(*) FROM accounts"),
("active", "SELECT COUNT(*) FROM accounts WHERE active = true"),
("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"),
("locked_search", by_queue("search")),
*[(f"locked_{x}", by_queue(x)) for x in gql_ops],
]
qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}"

Просмотреть файл

@ -1,5 +1,4 @@
import json
import time
from datetime import datetime
from typing import Awaitable, Callable
@ -9,7 +8,7 @@ from .accounts_pool import AccountsPool
from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL
from .logger import logger
from .models import Tweet, User
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_search_like
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep, utc_ts
class API:
@ -95,8 +94,8 @@ class API:
# possible account banned
if rep.status_code == 403:
logger.debug(f"Ban for {log_id}")
reset_ts = int(time.time() + 60 * 60) # 1 hour
logger.warning(f"403 for {log_id}")
reset_ts = utc_ts() + 60 * 60 # + 1 hour
await self.pool.lock_until(acc.username, queue, reset_ts)
continue
@ -256,7 +255,7 @@ class API:
async def tweet_details(self, twid: int):
rep = await self.tweet_details_raw(twid)
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
return Tweet.parse(obj["tweets"][str(twid)], obj)
# followers
@ -269,7 +268,7 @@ class API:
async def followers(self, uid: int, limit=-1):
async for rep in self.followers_raw(uid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["users"].items():
yield User.parse(v)
@ -283,7 +282,7 @@ class API:
async def following(self, uid: int, limit=-1):
async for rep in self.following_raw(uid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["users"].items():
yield User.parse(v)
@ -297,7 +296,7 @@ class API:
async def retweeters(self, twid: int, limit=-1):
async for rep in self.retweeters_raw(twid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["users"].items():
yield User.parse(v)
@ -311,7 +310,7 @@ class API:
async def favoriters(self, twid: int, limit=-1):
async for rep in self.favoriters_raw(twid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["users"].items():
yield User.parse(v)
@ -332,7 +331,7 @@ class API:
async def user_tweets(self, uid: int, limit=-1):
async for rep in self.user_tweets_raw(uid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["tweets"].items():
yield Tweet.parse(v, obj)
@ -353,6 +352,6 @@ class API:
async def user_tweets_and_replies(self, uid: int, limit=-1):
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
obj = to_search_like(rep.json())
obj = to_old_rep(rep.json())
for _, v in obj["tweets"].items():
yield Tweet.parse(v, obj)

Просмотреть файл

@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Optional
from .logger import logger
from .utils import find_item, get_or, int_or_none
@ -77,6 +76,7 @@ class UserRef(JSONTrait):
id: int
username: str
displayname: str
_type: str = "snscrape.modules.twitter.UserRef"
@staticmethod
def parse(obj: dict):
@ -102,7 +102,9 @@ class User(JSONTrait):
profileBannerUrl: str | None = None
protected: bool | None = None
verified: bool | None = None
_type: str = "snscrape.modules.twitter.User"
# todo:
# descriptionLinks: typing.Optional[typing.List[TextLink]] = None
# link: typing.Optional[TextLink] = None
# label: typing.Optional["UserLabel"] = None
@ -157,7 +159,9 @@ class Tweet(JSONTrait):
source: str | None = None
sourceUrl: str | None = None
sourceLabel: str | None = None
_type: str = "snscrape.modules.twitter.Tweet"
# todo:
# renderedContent: str
# media: typing.Optional[typing.List["Medium"]] = None
# card: typing.Optional["Card"] = None
@ -211,7 +215,7 @@ def _get_reply_user(tw_obj: dict, res: dict):
if mention:
return UserRef.parse(mention)
logger.debug(f'{tw_obj["in_reply_to_user_id_str"]}\n{json.dumps(res)}')
# todo: user not found in reply (probably deleted or hidden)
return None

Просмотреть файл

@ -1,5 +1,6 @@
import json
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, AsyncGenerator, Callable, TypeVar
from httpx import HTTPStatusError, Response
@ -123,7 +124,7 @@ def to_old_obj(obj: dict):
return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])}
def to_search_like(obj: dict):
def to_old_rep(obj: dict):
tmp = get_typed_object(obj, defaultdict(list))
tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x]
@ -133,3 +134,11 @@ def to_search_like(obj: dict):
users = {str(x["rest_id"]): to_old_obj(x) for x in users}
return {"tweets": tweets, "users": users}
def utc_ts() -> int:
    """Return the current UTC time as a unix timestamp (whole seconds).

    Uses the timezone-aware ``datetime.now(timezone.utc)`` rather than
    ``datetime.utcnow()``: the latter returns a naive datetime and is
    deprecated since Python 3.12. The result is identical.
    """
    return int(datetime.now(timezone.utc).timestamp())
def from_utciso(iso: str) -> datetime:
    """Parse an ISO-8601 string into a timezone-aware UTC datetime.

    SQLite's ``datetime(..., 'unixepoch')`` produces naive strings that are
    already in UTC, so a naive value gets UTC attached as-is. A string that
    carries its own offset is converted to UTC with ``astimezone`` instead of
    having the offset silently overwritten by ``replace`` (which would shift
    the actual instant).
    """
    dt = datetime.fromisoformat(iso)
    return dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt.astimezone(timezone.utc)