зеркало из
https://github.com/viginum-datalab/twscrape.git
synced 2025-10-30 21:46:13 +02:00
fix timezone issue; add _type to models
Этот коммит содержится в:
родитель
799f6c330a
Коммит
2587a62f3e
7
Makefile
7
Makefile
@ -8,15 +8,20 @@ build:
|
|||||||
|
|
||||||
lint:
|
lint:
|
||||||
ruff check twscrape
|
ruff check twscrape
|
||||||
|
ruff check tests
|
||||||
|
|
||||||
lint-fix:
|
lint-fix:
|
||||||
ruff check --fix twscrape
|
ruff check --fix twscrape
|
||||||
|
ruff check --fix tests
|
||||||
|
|
||||||
pylint:
|
pylint:
|
||||||
pylint --errors-only twscrape
|
pylint --errors-only twscrape
|
||||||
|
|
||||||
test:
|
test:
|
||||||
pytest --cov=twscrape tests/
|
pytest -s --cov=twscrape tests/
|
||||||
|
|
||||||
|
get-cov:
|
||||||
|
coverage report -m
|
||||||
|
|
||||||
act:
|
act:
|
||||||
act --container-architecture linux/amd64
|
act --container-architecture linux/amd64
|
||||||
|
|||||||
@ -75,8 +75,13 @@ if __name__ == "__main__":
|
|||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
You can use `login_all` once in your program to pass the login flow and add the accounts to the database. Re-runs will use the previously activated accounts.
|
Note on rate limits:
|
||||||
|
- Search API – 250 requests per account / 15 minutes
|
||||||
|
- GraphQL API – 500 requests per account per operation / 15 minutes
|
||||||
|
|
||||||
### Models
|
### Models
|
||||||
- [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet)
|
- [Tweet](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20Tweet)
|
||||||
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
|
- [User](https://github.com/vladkens/twscrape/blob/main/twscrape/models.py#:~:text=class%20User)
|
||||||
|
|
||||||
|
### Related
|
||||||
|
- [SNScrape](https://github.com/JustAnotherArchivist/snscrape) – a scraper for social networking services (SNS)
|
||||||
|
|||||||
@ -65,8 +65,14 @@ async def test_search():
|
|||||||
obj = doc.dict()
|
obj = doc.dict()
|
||||||
assert doc.id == obj["id"]
|
assert doc.id == obj["id"]
|
||||||
assert doc.user.id == obj["user"]["id"]
|
assert doc.user.id == obj["user"]["id"]
|
||||||
|
|
||||||
assert "url" in obj
|
assert "url" in obj
|
||||||
|
assert "_type" in obj
|
||||||
|
assert obj["_type"] == "snscrape.modules.twitter.Tweet"
|
||||||
|
|
||||||
assert "url" in obj["user"]
|
assert "url" in obj["user"]
|
||||||
|
assert "_type" in obj["user"]
|
||||||
|
assert obj["user"]["_type"] == "snscrape.modules.twitter.User"
|
||||||
|
|
||||||
txt = doc.json()
|
txt = doc.json()
|
||||||
assert isinstance(txt, str)
|
assert isinstance(txt, str)
|
||||||
|
|||||||
@ -2,8 +2,9 @@ import os
|
|||||||
|
|
||||||
from twscrape.accounts_pool import AccountsPool
|
from twscrape.accounts_pool import AccountsPool
|
||||||
from twscrape.db import DB
|
from twscrape.db import DB
|
||||||
|
from twscrape.utils import utc_ts
|
||||||
|
|
||||||
DB_FILE = "/tmp/twapi_test.db"
|
DB_FILE = "/tmp/twscrape_test.db"
|
||||||
|
|
||||||
|
|
||||||
def remove_db():
|
def remove_db():
|
||||||
@ -105,3 +106,58 @@ async def test_get_for_queue():
|
|||||||
# should return None
|
# should return None
|
||||||
acc = await pool.get_for_queue(Q)
|
acc = await pool.get_for_queue(Q)
|
||||||
assert acc is None
|
assert acc is None
|
||||||
|
|
||||||
|
|
||||||
|
async def test_account_unlock():
|
||||||
|
remove_db()
|
||||||
|
pool = AccountsPool(DB_FILE)
|
||||||
|
Q = "test_queue"
|
||||||
|
|
||||||
|
await pool.add_account("user1", "pass1", "email1", "email_pass1")
|
||||||
|
await pool.set_active("user1", True)
|
||||||
|
acc = await pool.get_for_queue(Q)
|
||||||
|
assert acc is not None
|
||||||
|
assert acc.locks[Q] is not None
|
||||||
|
|
||||||
|
# should unlock account and make available for queue
|
||||||
|
await pool.unlock(acc.username, Q)
|
||||||
|
acc = await pool.get_for_queue(Q)
|
||||||
|
assert acc is not None
|
||||||
|
assert acc.locks[Q] is not None
|
||||||
|
|
||||||
|
# should update lock time
|
||||||
|
end_time = utc_ts() + 60 # + 1 minute
|
||||||
|
await pool.lock_until(acc.username, Q, end_time)
|
||||||
|
|
||||||
|
acc = await pool.get(acc.username)
|
||||||
|
assert int(acc.locks[Q].timestamp()) == end_time
|
||||||
|
|
||||||
|
|
||||||
|
async def test_get_stats():
|
||||||
|
remove_db()
|
||||||
|
pool = AccountsPool(DB_FILE)
|
||||||
|
Q = "search"
|
||||||
|
|
||||||
|
# should return empty stats
|
||||||
|
stats = await pool.stats()
|
||||||
|
for k, v in stats.items():
|
||||||
|
assert v == 0, f"{k} should be 0"
|
||||||
|
|
||||||
|
# should increase total
|
||||||
|
await pool.add_account("user1", "pass1", "email1", "email_pass1")
|
||||||
|
stats = await pool.stats()
|
||||||
|
assert stats["total"] == 1
|
||||||
|
assert stats["active"] == 0
|
||||||
|
|
||||||
|
# should increase active
|
||||||
|
await pool.set_active("user1", True)
|
||||||
|
stats = await pool.stats()
|
||||||
|
assert stats["total"] == 1
|
||||||
|
assert stats["active"] == 1
|
||||||
|
|
||||||
|
# should update queue stats
|
||||||
|
await pool.get_for_queue(Q)
|
||||||
|
stats = await pool.stats()
|
||||||
|
assert stats["total"] == 1
|
||||||
|
assert stats["active"] == 1
|
||||||
|
assert stats["locked_search"] == 1
|
||||||
|
|||||||
@ -6,6 +6,7 @@ from datetime import datetime
|
|||||||
from httpx import AsyncClient, AsyncHTTPTransport
|
from httpx import AsyncClient, AsyncHTTPTransport
|
||||||
|
|
||||||
from .constants import TOKEN
|
from .constants import TOKEN
|
||||||
|
from .utils import from_utciso
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -43,7 +44,7 @@ class Account:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def from_rs(rs: sqlite3.Row):
|
def from_rs(rs: sqlite3.Row):
|
||||||
doc = dict(rs)
|
doc = dict(rs)
|
||||||
doc["locks"] = {k: datetime.fromisoformat(v) for k, v in json.loads(doc["locks"]).items()}
|
doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()}
|
||||||
doc["headers"] = json.loads(doc["headers"])
|
doc["headers"] = json.loads(doc["headers"])
|
||||||
doc["cookies"] = json.loads(doc["cookies"])
|
doc["cookies"] = json.loads(doc["cookies"])
|
||||||
doc["active"] = bool(doc["active"])
|
doc["active"] = bool(doc["active"])
|
||||||
|
|||||||
@ -88,7 +88,6 @@ class AccountsPool:
|
|||||||
await execute(self._db_file, qs, {"username": username, "active": active})
|
await execute(self._db_file, qs, {"username": username, "active": active})
|
||||||
|
|
||||||
async def lock_until(self, username: str, queue: str, unlock_at: int):
|
async def lock_until(self, username: str, queue: str, unlock_at: int):
|
||||||
# unlock_at is unix timestamp
|
|
||||||
qs = f"""
|
qs = f"""
|
||||||
UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch'))
|
UPDATE accounts SET locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch'))
|
||||||
WHERE username = :username
|
WHERE username = :username
|
||||||
@ -141,11 +140,18 @@ class AccountsPool:
|
|||||||
WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now')
|
WHERE json_extract(locks, '$.{queue}') IS NOT NULL AND json_extract(locks, '$.{queue}') > datetime('now')
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
gql_ops = """
|
||||||
|
UserByRestId UserByScreenName TweetDetail Followers Following
|
||||||
|
Retweeters Favoriters UserTweets UserTweetsAndReplies
|
||||||
|
"""
|
||||||
|
gql_ops = [x.strip() for x in gql_ops.split(" ") if x.strip()]
|
||||||
|
|
||||||
config = [
|
config = [
|
||||||
("total", "SELECT COUNT(*) FROM accounts"),
|
("total", "SELECT COUNT(*) FROM accounts"),
|
||||||
("active", "SELECT COUNT(*) FROM accounts WHERE active = true"),
|
("active", "SELECT COUNT(*) FROM accounts WHERE active = true"),
|
||||||
("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"),
|
("inactive", "SELECT COUNT(*) FROM accounts WHERE active = false"),
|
||||||
("locked_search", by_queue("search")),
|
("locked_search", by_queue("search")),
|
||||||
|
*[(f"locked_{x}", by_queue(x)) for x in gql_ops],
|
||||||
]
|
]
|
||||||
|
|
||||||
qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}"
|
qs = f"SELECT {','.join([f'({q}) as {k}' for k, q in config])}"
|
||||||
|
|||||||
@ -1,5 +1,4 @@
|
|||||||
import json
|
import json
|
||||||
import time
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Awaitable, Callable
|
from typing import Awaitable, Callable
|
||||||
|
|
||||||
@ -9,7 +8,7 @@ from .accounts_pool import AccountsPool
|
|||||||
from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL
|
from .constants import GQL_FEATURES, GQL_URL, SEARCH_PARAMS, SEARCH_URL
|
||||||
from .logger import logger
|
from .logger import logger
|
||||||
from .models import Tweet, User
|
from .models import Tweet, User
|
||||||
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_search_like
|
from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep, utc_ts
|
||||||
|
|
||||||
|
|
||||||
class API:
|
class API:
|
||||||
@ -95,8 +94,8 @@ class API:
|
|||||||
|
|
||||||
# possible account banned
|
# possible account banned
|
||||||
if rep.status_code == 403:
|
if rep.status_code == 403:
|
||||||
logger.debug(f"Ban for {log_id}")
|
logger.warning(f"403 for {log_id}")
|
||||||
reset_ts = int(time.time() + 60 * 60) # 1 hour
|
reset_ts = utc_ts() + 60 * 60 # + 1 hour
|
||||||
await self.pool.lock_until(acc.username, queue, reset_ts)
|
await self.pool.lock_until(acc.username, queue, reset_ts)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -256,7 +255,7 @@ class API:
|
|||||||
|
|
||||||
async def tweet_details(self, twid: int):
|
async def tweet_details(self, twid: int):
|
||||||
rep = await self.tweet_details_raw(twid)
|
rep = await self.tweet_details_raw(twid)
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
return Tweet.parse(obj["tweets"][str(twid)], obj)
|
return Tweet.parse(obj["tweets"][str(twid)], obj)
|
||||||
|
|
||||||
# followers
|
# followers
|
||||||
@ -269,7 +268,7 @@ class API:
|
|||||||
|
|
||||||
async def followers(self, uid: int, limit=-1):
|
async def followers(self, uid: int, limit=-1):
|
||||||
async for rep in self.followers_raw(uid, limit=limit):
|
async for rep in self.followers_raw(uid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["users"].items():
|
for _, v in obj["users"].items():
|
||||||
yield User.parse(v)
|
yield User.parse(v)
|
||||||
|
|
||||||
@ -283,7 +282,7 @@ class API:
|
|||||||
|
|
||||||
async def following(self, uid: int, limit=-1):
|
async def following(self, uid: int, limit=-1):
|
||||||
async for rep in self.following_raw(uid, limit=limit):
|
async for rep in self.following_raw(uid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["users"].items():
|
for _, v in obj["users"].items():
|
||||||
yield User.parse(v)
|
yield User.parse(v)
|
||||||
|
|
||||||
@ -297,7 +296,7 @@ class API:
|
|||||||
|
|
||||||
async def retweeters(self, twid: int, limit=-1):
|
async def retweeters(self, twid: int, limit=-1):
|
||||||
async for rep in self.retweeters_raw(twid, limit=limit):
|
async for rep in self.retweeters_raw(twid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["users"].items():
|
for _, v in obj["users"].items():
|
||||||
yield User.parse(v)
|
yield User.parse(v)
|
||||||
|
|
||||||
@ -311,7 +310,7 @@ class API:
|
|||||||
|
|
||||||
async def favoriters(self, twid: int, limit=-1):
|
async def favoriters(self, twid: int, limit=-1):
|
||||||
async for rep in self.favoriters_raw(twid, limit=limit):
|
async for rep in self.favoriters_raw(twid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["users"].items():
|
for _, v in obj["users"].items():
|
||||||
yield User.parse(v)
|
yield User.parse(v)
|
||||||
|
|
||||||
@ -332,7 +331,7 @@ class API:
|
|||||||
|
|
||||||
async def user_tweets(self, uid: int, limit=-1):
|
async def user_tweets(self, uid: int, limit=-1):
|
||||||
async for rep in self.user_tweets_raw(uid, limit=limit):
|
async for rep in self.user_tweets_raw(uid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["tweets"].items():
|
for _, v in obj["tweets"].items():
|
||||||
yield Tweet.parse(v, obj)
|
yield Tweet.parse(v, obj)
|
||||||
|
|
||||||
@ -353,6 +352,6 @@ class API:
|
|||||||
|
|
||||||
async def user_tweets_and_replies(self, uid: int, limit=-1):
|
async def user_tweets_and_replies(self, uid: int, limit=-1):
|
||||||
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
|
async for rep in self.user_tweets_and_replies_raw(uid, limit=limit):
|
||||||
obj = to_search_like(rep.json())
|
obj = to_old_rep(rep.json())
|
||||||
for _, v in obj["tweets"].items():
|
for _, v in obj["tweets"].items():
|
||||||
yield Tweet.parse(v, obj)
|
yield Tweet.parse(v, obj)
|
||||||
|
|||||||
@ -5,7 +5,6 @@ from dataclasses import asdict, dataclass
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from .logger import logger
|
|
||||||
from .utils import find_item, get_or, int_or_none
|
from .utils import find_item, get_or, int_or_none
|
||||||
|
|
||||||
|
|
||||||
@ -77,6 +76,7 @@ class UserRef(JSONTrait):
|
|||||||
id: int
|
id: int
|
||||||
username: str
|
username: str
|
||||||
displayname: str
|
displayname: str
|
||||||
|
_type: str = "snscrape.modules.twitter.UserRef"
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse(obj: dict):
|
def parse(obj: dict):
|
||||||
@ -102,7 +102,9 @@ class User(JSONTrait):
|
|||||||
profileBannerUrl: str | None = None
|
profileBannerUrl: str | None = None
|
||||||
protected: bool | None = None
|
protected: bool | None = None
|
||||||
verified: bool | None = None
|
verified: bool | None = None
|
||||||
|
_type: str = "snscrape.modules.twitter.User"
|
||||||
|
|
||||||
|
# todo:
|
||||||
# descriptionLinks: typing.Optional[typing.List[TextLink]] = None
|
# descriptionLinks: typing.Optional[typing.List[TextLink]] = None
|
||||||
# link: typing.Optional[TextLink] = None
|
# link: typing.Optional[TextLink] = None
|
||||||
# label: typing.Optional["UserLabel"] = None
|
# label: typing.Optional["UserLabel"] = None
|
||||||
@ -157,7 +159,9 @@ class Tweet(JSONTrait):
|
|||||||
source: str | None = None
|
source: str | None = None
|
||||||
sourceUrl: str | None = None
|
sourceUrl: str | None = None
|
||||||
sourceLabel: str | None = None
|
sourceLabel: str | None = None
|
||||||
|
_type: str = "snscrape.modules.twitter.Tweet"
|
||||||
|
|
||||||
|
# todo:
|
||||||
# renderedContent: str
|
# renderedContent: str
|
||||||
# media: typing.Optional[typing.List["Medium"]] = None
|
# media: typing.Optional[typing.List["Medium"]] = None
|
||||||
# card: typing.Optional["Card"] = None
|
# card: typing.Optional["Card"] = None
|
||||||
@ -211,7 +215,7 @@ def _get_reply_user(tw_obj: dict, res: dict):
|
|||||||
if mention:
|
if mention:
|
||||||
return UserRef.parse(mention)
|
return UserRef.parse(mention)
|
||||||
|
|
||||||
logger.debug(f'{tw_obj["in_reply_to_user_id_str"]}\n{json.dumps(res)}')
|
# todo: user not found in reply (probably deleted or hidden)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timezone
|
||||||
from typing import Any, AsyncGenerator, Callable, TypeVar
|
from typing import Any, AsyncGenerator, Callable, TypeVar
|
||||||
|
|
||||||
from httpx import HTTPStatusError, Response
|
from httpx import HTTPStatusError, Response
|
||||||
@ -123,7 +124,7 @@ def to_old_obj(obj: dict):
|
|||||||
return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])}
|
return {**obj, **obj["legacy"], "id_str": str(obj["rest_id"]), "id": int(obj["rest_id"])}
|
||||||
|
|
||||||
|
|
||||||
def to_search_like(obj: dict):
|
def to_old_rep(obj: dict):
|
||||||
tmp = get_typed_object(obj, defaultdict(list))
|
tmp = get_typed_object(obj, defaultdict(list))
|
||||||
|
|
||||||
tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x]
|
tweets = [x for x in tmp.get("Tweet", []) if "legacy" in x]
|
||||||
@ -133,3 +134,11 @@ def to_search_like(obj: dict):
|
|||||||
users = {str(x["rest_id"]): to_old_obj(x) for x in users}
|
users = {str(x["rest_id"]): to_old_obj(x) for x in users}
|
||||||
|
|
||||||
return {"tweets": tweets, "users": users}
|
return {"tweets": tweets, "users": users}
|
||||||
|
|
||||||
|
|
||||||
|
def utc_ts() -> int:
    """Return the current time as a whole-second Unix timestamp.

    The value is taken from a timezone-aware UTC datetime, so it does not
    depend on the local timezone of the machine. ``datetime.utcnow()`` is
    avoided because it returns a naive datetime and is deprecated since
    Python 3.12.
    """
    return int(datetime.now(timezone.utc).timestamp())
|
||||||
|
|
||||||
|
|
||||||
|
def from_utciso(iso: str) -> datetime:
    """Parse an ISO-8601 string into a timezone-aware UTC datetime.

    A string without an offset is interpreted as already being in UTC and is
    simply tagged with ``timezone.utc``. A string that carries its own offset
    is converted to UTC, so the instant it denotes is preserved instead of
    having its offset silently overwritten.

    Raises:
        ValueError: if ``iso`` is not a valid ISO-8601 date/time string.
    """
    parsed = datetime.fromisoformat(iso)
    if parsed.tzinfo is None:
        # Naive value: treat the wall-clock fields as UTC.
        return parsed.replace(tzinfo=timezone.utc)
    # Aware value: shift to UTC rather than relabelling the offset.
    return parsed.astimezone(timezone.utc)
|
||||||
|
|||||||
Загрузка…
x
Ссылка в новой задаче
Block a user