зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-29 13:06:13 +02:00
fix timezone issue; add _type to models
Этот коммит содержится в:
родитель
799f6c330a
Коммит
2587a62f3e
7
Makefile
7
Makefile
@ -8,15 +8,20 @@ build:
|
||||
|
||||
lint:
|
||||
ruff check twscrape
|
||||
ruff check tests
|
||||
|
||||
lint-fix:
|
||||
ruff check --fix twscrape
|
||||
ruff check --fix tests
|
||||
|
||||
pylint:
|
||||
pylint --errors-only twscrape
|
||||
|
||||
test:
|
||||
pytest --cov=twscrape tests/
|
||||
pytest -s --cov=twscrape tests/
|
||||
|
||||
get-cov:
|
||||
coverage report -m
|
||||
|
||||
act:
|
||||
act --container-architecture linux/amd64
|
||||
|
||||
@ -75,8 +75,13 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
You can use `login_all` once in your program to pass the login flow and add the accounts to the database. Re-runs will use the previously activated accounts.
|
||||
Note on rate limits:
|
||||
- Search API – 250 requests per account / 15 minutes
|
||||
- GraphQL API – 500 requests per account per operation / 15 minutes
|
||||
|
||||
### Models
|
||||
- [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet)
|
||||
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
|
||||
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
|
||||
|
||||
### Related
|
||||
- [SNScrape](https://github.com/JustAnotherArchivist/snscrape) – is a scraper for social networking services (SNS)
|
||||
|
||||
@ -65,8 +65,14 @@ async def test_search():
|
||||
obj = doc.dict()
|
||||
assert doc.id == obj["id"]
|
||||
assert doc.user.id == obj["user"]["id"]
|
||||
|
||||
assert "url" in obj
|
||||
assert "_type" in obj
|
||||
assert obj["_type"] == "snscrape.modules.twitter.Tweet"
|
||||
|
||||
assert "url" in obj["user"]
|
||||
assert "_type" in obj["user"]
|
||||
assert obj["user"]["_type"] == "snscrape.modules.twitter.User"
|
||||
|
||||
txt = doc.json()
|
||||
assert isinstance(txt, str)
|
||||
|
||||
@ -2,8 +2,9 @@ import os
|
||||
|
||||
from twscrape.accounts_pool import AccountsPool
|
||||
from twscrape.db import DB
|
||||
from twscrape.utils import utc_ts
|
||||
|
||||
DB_FILE = "/tmp/twapi_test.db"
|
||||
DB_FILE = "/tmp/twscrape_test.db"
|
||||
|
||||
|
||||
def remove_db():
|
||||
@ -105,3 +106,58 @@ async def test_get_for_queue():
|
||||
# should return None
|
||||
acc = await pool.get_for_queue(Q)
|
||||
assert acc is None
|
||||
|
||||
|
||||
async def test_account_unlock():
    """Check that an account can be unlocked for a queue and locked again until a given time."""
    remove_db()
    queue = "test_queue"
    pool = AccountsPool(DB_FILE)

    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    await pool.set_active("user1", True)

    # taking the account for the queue is expected to leave a lock entry for that queue
    first = await pool.get_for_queue(queue)
    assert first is not None
    assert first.locks[queue] is not None

    # once unlocked, the queue must be able to obtain an account again
    await pool.unlock(first.username, queue)
    second = await pool.get_for_queue(queue)
    assert second is not None
    assert second.locks[queue] is not None

    # lock_until should store exactly the requested unix timestamp
    end_time = utc_ts() + 60  # + 1 minute
    await pool.lock_until(second.username, queue, end_time)

    reloaded = await pool.get(second.username)
    assert int(reloaded.locks[queue].timestamp()) == end_time
|
||||
|
||||
|
||||
async def test_get_stats():
    """Follow pool.stats() through the empty, added, activated and queue-locked states."""
    remove_db()
    queue = "search"
    pool = AccountsPool(DB_FILE)

    # fresh database: every counter must be zero
    empty = await pool.stats()
    for k, v in empty.items():
        assert v == 0, f"{k} should be 0"

    # adding an account should increase "total" but not "active"
    await pool.add_account("user1", "pass1", "email1", "email_pass1")
    after_add = await pool.stats()
    assert after_add["total"] == 1
    assert after_add["active"] == 0

    # activating the account should increase "active"
    await pool.set_active("user1", True)
    after_activate = await pool.stats()
    assert after_activate["total"] == 1
    assert after_activate["active"] == 1

    # taking the account for the search queue should show up in the queue counter
    await pool.get_for_queue(queue)
    after_lock = await pool.stats()
    assert after_lock["total"] == 1
    assert after_lock["active"] == 1
    assert after_lock["locked_search"] == 1
|
||||
|
||||
@ -6,6 +6,7 @@ from datetime import datetime
|
||||
from httpx import AsyncClient, AsyncHTTPTransport
|
||||
|
||||
from .constants import TOKEN
|
||||
from .utils import from_utciso
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -43,7 +44,7 @@ class Account:
|
||||
@staticmethod
|
||||
def from_rs(rs: sqlite3.Row):
|
||||
doc = dict(rs)
|
||||
doc["locks"] = {k: datetime.fromisoformat(v) for k, v in json.loads(doc["locks"]).items()}
|
||||
doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()}
|
||||
doc["headers"] = json.loads(doc["headers"])
|
||||
doc["cookies"] = json.loads(doc["cookies"])
|
||||
doc["active"] = bool(doc["active"])
|
||||
|
||||
@ -88,7 +88,6 @@ class AccountsPool:
|
||||
await execute(self._db_file, qs, {"username": username, "active": active})
|
||||
|
||||
async def lock_until(self, username: str, queue: str, unlock_at: int):
|
||||
# unlock_at is unix timestamp
|
||||
qs = f"""
|
||||
UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch'))
|
||||
WHERE username = :username
|
||||
@ -141,11 +140,18 @@ class AccountsPool:
|
||||
WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now')
|
||||
"""
|
||||
|
||||
gql_ops = """
|
||||
UserByRestId UserByScreenName TweetDetail Followers Following
|
||||
Retweeters Favoriters UserTweets UserTweetsAndReplies
|
||||
"""
|
||||
gql_ops = [x.strip() for x in gql_ops.split(" ") if x.strip()]
|
||||
|
||||
config = [
|
||||
("total", "SELECT COUNT(*) FROM accounts"),
|
||||
("active", "SELECT COUNT(*) FROM accounts WHERE active = true"),
|
||||
("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"),
|
||||
("locked_search", by_queue("search")),
|
||||
*[(f"locked_{x}", by_queue(x)) for x in gql_ops],
|
||||
]
|
||||
|
||||
qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}"
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Awaitable, Callable
|
||||
|
||||
@ -9,7 +8,7 @@ from .accounts_pool import AccountsPool
|
||||
from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL
|
||||
from .logger import logger
|
||||
from .models import Tweet, User
|
||||
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_search_like
|
||||
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep, utc_ts
|
||||
|
||||
|
||||
class API:
|
||||
@ -95,8 +94,8 @@ class API:
|
||||
|
||||
# possible account banned
|
||||
if rep.status_code == 403:
|
||||
logger.debug(f"Ban for {log_id}")
|
||||
reset_ts = int(time.time() + 60 * 60) # 1 hour
|
||||
logger.warning(f"403 for {log_id}")
|
||||
reset_ts = utc_ts() + 60 * 60 # + 1 hour
|
||||
await self.pool.lock_until(acc.username, queue, reset_ts)
|
||||
continue
|
||||
|
||||
@ -256,7 +255,7 @@ class API:
|
||||
|
||||
async def tweet_details(self, twid: int):
|
||||
rep = await self.tweet_details_raw(twid)
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
return Tweet.parse(obj["tweets"][str(twid)], obj)
|
||||
|
||||
# followers
|
||||
@ -269,7 +268,7 @@ class API:
|
||||
|
||||
async def followers(self, uid: int, limit=-1):
|
||||
async for rep in self.followers_raw(uid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["users"].items():
|
||||
yield User.parse(v)
|
||||
|
||||
@ -283,7 +282,7 @@ class API:
|
||||
|
||||
async def following(self, uid: int, limit=-1):
|
||||
async for rep in self.following_raw(uid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["users"].items():
|
||||
yield User.parse(v)
|
||||
|
||||
@ -297,7 +296,7 @@ class API:
|
||||
|
||||
async def retweeters(self, twid: int, limit=-1):
|
||||
async for rep in self.retweeters_raw(twid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["users"].items():
|
||||
yield User.parse(v)
|
||||
|
||||
@ -311,7 +310,7 @@ class API:
|
||||
|
||||
async def favoriters(self, twid: int, limit=-1):
|
||||
async for rep in self.favoriters_raw(twid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["users"].items():
|
||||
yield User.parse(v)
|
||||
|
||||
@ -332,7 +331,7 @@ class API:
|
||||
|
||||
async def user_tweets(self, uid: int, limit=-1):
|
||||
async for rep in self.user_tweets_raw(uid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["tweets"].items():
|
||||
yield Tweet.parse(v, obj)
|
||||
|
||||
@ -353,6 +352,6 @@ class API:
|
||||
|
||||
async def user_tweets_and_replies(self, uid: int, limit=-1):
|
||||
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
|
||||
obj = to_search_like(rep.json())
|
||||
obj = to_old_rep(rep.json())
|
||||
for _, v in obj["tweets"].items():
|
||||
yield Tweet.parse(v, obj)
|
||||
|
||||
@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from .logger import logger
|
||||
from .utils import find_item, get_or, int_or_none
|
||||
|
||||
|
||||
@ -77,6 +76,7 @@ class UserRef(JSONTrait):
|
||||
id: int
|
||||
username: str
|
||||
displayname: str
|
||||
_type: str = "snscrape.modules.twitter.UserRef"
|
||||
|
||||
@staticmethod
|
||||
def parse(obj: dict):
|
||||
@ -102,7 +102,9 @@ class User(JSONTrait):
|
||||
profileBannerUrl: str | None = None
|
||||
protected: bool | None = None
|
||||
verified: bool | None = None
|
||||
_type: str = "snscrape.modules.twitter.User"
|
||||
|
||||
# todo:
|
||||
# descriptionLinks: typing.Optional[typing.List[TextLink]] = None
|
||||
# link: typing.Optional[TextLink] = None
|
||||
# label: typing.Optional["UserLabel"] = None
|
||||
@ -157,7 +159,9 @@ class Tweet(JSONTrait):
|
||||
source: str | None = None
|
||||
sourceUrl: str | None = None
|
||||
sourceLabel: str | None = None
|
||||
_type: str = "snscrape.modules.twitter.Tweet"
|
||||
|
||||
# todo:
|
||||
# renderedContent: str
|
||||
# media: typing.Optional[typing.List["Medium"]] = None
|
||||
# card: typing.Optional["Card"] = None
|
||||
@ -211,7 +215,7 @@ def _get_reply_user(tw_obj: dict, res: dict):
|
||||
if mention:
|
||||
return UserRef.parse(mention)
|
||||
|
||||
logger.debug(f'{tw_obj["in_reply_to_user_id_str"]}\n{json.dumps(res)}')
|
||||
# todo: user not found in reply (probably deleted or hidden)
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, AsyncGenerator, Callable, TypeVar
|
||||
|
||||
from httpx import HTTPStatusError, Response
|
||||
@ -123,7 +124,7 @@ def to_old_obj(obj: dict):
|
||||
return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])}
|
||||
|
||||
|
||||
def to_search_like(obj: dict):
|
||||
def to_old_rep(obj: dict):
|
||||
tmp = get_typed_object(obj, defaultdict(list))
|
||||
|
||||
tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x]
|
||||
@ -133,3 +134,11 @@ def to_search_like(obj: dict):
|
||||
users = {str(x["rest_id"]): to_old_obj(x) for x in users}
|
||||
|
||||
return {"tweets": tweets, "users": users}
|
||||
|
||||
|
||||
def utc_ts() -> int:
    """Return the current Unix timestamp (seconds since the epoch) as an int.

    Returns:
        Whole seconds elapsed since 1970-01-01T00:00:00Z, truncated toward zero.
    """
    # datetime.utcnow() is deprecated since Python 3.12; asking for an aware
    # "now" in UTC yields the same epoch value without the naive intermediate.
    return int(datetime.now(timezone.utc).timestamp())
|
||||
|
||||
|
||||
def from_utciso(iso: str) -> datetime:
    """Parse an ISO-8601 string into a timezone-aware datetime in UTC.

    Args:
        iso: A string accepted by ``datetime.fromisoformat``. A value without
            an offset is interpreted as already being in UTC.

    Returns:
        An aware ``datetime`` whose tzinfo is ``timezone.utc``.

    Raises:
        ValueError: If ``iso`` is not a valid ISO-8601 string.
    """
    parsed = datetime.fromisoformat(iso)
    if parsed.tzinfo is None:
        # naive value: treat it as UTC wall-clock time
        return parsed.replace(tzinfo=timezone.utc)
    # an explicit offset must be converted, not overwritten, to keep the same instant
    return parsed.astimezone(timezone.utc)
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user