Этот коммит содержится в:
Vlad Pronsky 2023-11-01 20:46:01 +02:00
родитель fd64ce2018
Коммит 6a232da016
17 изменённых файлов: 100 добавлений и 74 удалений

2
.github/workflows/test.yml поставляемый
Просмотреть файл

@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.10", "3.11"] python-version: ["3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4

1
.tool-versions Обычный файл
Просмотреть файл

@ -0,0 +1 @@
python 3.12.0

4
.vscode/settings.json поставляемый
Просмотреть файл

@ -8,7 +8,7 @@
}, },
"[python]": { "[python]": {
"editor.formatOnSave": true, "editor.formatOnSave": true,
"editor.codeActionsOnSave": ["source.organizeImports"] "editor.codeActionsOnSave": ["source.organizeImports"],
"editor.defaultFormatter": "ms-python.black-formatter"
}, },
"python.formatting.provider": "black"
} }

11
Dockerfile.python Обычный файл
Просмотреть файл

@ -0,0 +1,11 @@
# Python version of the base image; override with --build-arg VER=<x.y>.
ARG VER=3.12
FROM python:${VER}-alpine

# --no-cache: use a temporary package index so no apk cache is left in the layer.
RUN apk add --no-cache git

WORKDIR /app

# Project metadata is copied on its own, ahead of the full source tree,
# and the dev dependencies are installed against it.
COPY pyproject.toml readme.md /app/
# Quoted so the shell cannot glob-expand ".[dev]" against files in /app;
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir -e ".[dev]"

COPY . /app

# Print the interpreter version, then run the test suite.
CMD python --version; pytest tests/

Просмотреть файл

Просмотреть файл

@ -1,7 +1,5 @@
.PHONY: all build .PHONY: all build
SQTEST = docker -l warning build -f sqlite.dockerfile
all: all:
@echo "hi" @echo "hi"
@ -38,27 +36,33 @@ show-cov:
@coverage html @coverage html
@open htmlcov/index.html @open htmlcov/index.html
# Run the `act` CLI with the container architecture pinned to linux/amd64.
# Declared phony so a file or directory named `act` cannot mask the target.
.PHONY: act
act:
	@act --container-architecture linux/amd64
changelog: changelog:
@git pull origin --tags > /dev/null @git pull origin --tags > /dev/null
@git log $(shell git describe --tags --abbrev=0 HEAD)^..HEAD --pretty=format:'- %s' @git log $(shell git describe --tags --abbrev=0 HEAD)^..HEAD --pretty=format:'- %s'
test34: test-py:
$(eval name=twscrape_py$(v))
@docker -l warning build -f Dockerfile.python --build-arg VER=$(v) -t $(name) .
@docker run $(name)
# Build an image from Dockerfile.sqlite for one SQLite release and run it.
# Usage: make test-sq y=<year> v=<version>   (e.g. y=2023 v=3430000)
# `y` and `v` are forwarded as the SQLY / SQLV build args; the image is
# tagged twscrape_sq<v>.
.PHONY: test-sq
# Target-specific variable instead of $(eval ...) inside the recipe, so the
# image name does not leak into the global variable namespace.
test-sq: name = twscrape_sq$(v)
test-sq:
	@docker -l warning build -f Dockerfile.sqlite --build-arg SQLY=$(y) --build-arg SQLV=$(v) -t $(name) .
	@docker run $(name)
# Run the test-py target once per supported Python version.
# $(MAKE) rather than literal `make`, so command-line flags (-n, -j, -k) and
# the jobserver are propagated to the sub-makes.
.PHONY: test-py-matrix
test-py-matrix:
	@$(MAKE) test-py v=3.10
	@$(MAKE) test-py v=3.11
	@$(MAKE) test-py v=3.12
test-sq-matrix:
@# https://www.sqlite.org/chronology.html @# https://www.sqlite.org/chronology.html
@$(SQTEST) --build-arg SQLY=2018 --build-arg SQLV=3240000 -t twscrape_sq24 . @make test-sq y=2018 v=3240000
@$(SQTEST) --build-arg SQLY=2019 --build-arg SQLV=3270200 -t twscrape_sq27 . @make test-sq y=2019 v=3270200
@$(SQTEST) --build-arg SQLY=2019 --build-arg SQLV=3300100 -t twscrape_sq30 . @make test-sq y=2019 v=3300100
@$(SQTEST) --build-arg SQLY=2020 --build-arg SQLV=3330000 -t twscrape_sq33 . @make test-sq y=2020 v=3330000
@$(SQTEST) --build-arg SQLY=2021 --build-arg SQLV=3340100 -t twscrape_sq34 . @make test-sq y=2021 v=3340100
@$(SQTEST) --build-arg SQLY=2023 --build-arg SQLV=3430000 -t twscrape_sq43 . @make test-sq y=2023 v=3430000
@docker run twscrape_sq24
@docker run twscrape_sq27
@docker run twscrape_sq30
@docker run twscrape_sq33
@docker run twscrape_sq34
@docker run twscrape_sq43
update-mocks: update-mocks:
twscrape user_by_id --raw 2244994945 | jq > ./tests/mocked-data/user_by_id_raw.json twscrape user_by_id --raw 2244994945 | jq > ./tests/mocked-data/user_by_id_raw.json

Просмотреть файл

@ -16,22 +16,23 @@ classifiers = [
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
] ]
dependencies = [ dependencies = [
"aiosqlite==0.17.0", "aiosqlite>=0.17.0",
"fake-useragent==1.2.1", "fake-useragent>=1.3.0",
"httpx==0.24.0", "httpx>=0.24.0",
"loguru==0.7.0", "loguru>=0.7.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]
dev = [ dev = [
"pylint==2.17.3", "pylint>=2.17.3",
"pytest-asyncio==0.21.0", "pytest-asyncio>=0.21.0",
"pytest-cov==4.0.0", "pytest-cov>=4.0.0",
"pytest-httpx==0.22.0", "pytest-httpx>=0.22.0",
"pytest==7.3.1", "pytest>=7.4.0",
"ruff==0.0.263", "ruff"
] ]
[project.urls] [project.urls]
@ -46,13 +47,13 @@ packages = ['twscrape']
[tool.pylint] [tool.pylint]
max-line-length = 99 max-line-length = 99
disable = [ disable = [
"C0103", # invalid-name "C0103", # invalid-name
"C0114", # missing-module-docstring "C0114", # missing-module-docstring
"C0115", # missing-class-docstring "C0115", # missing-class-docstring
"C0116", # missing-function-docstring "C0116", # missing-function-docstring
"R0903", # too-few-public-methods "R0903", # too-few-public-methods
"R0913", # too-many-arguments "R0913", # too-many-arguments
"W0105", # pointless-string-statement "W0105", # pointless-string-statement
] ]
[tool.pytest.ini_options] [tool.pytest.ini_options]
@ -67,3 +68,6 @@ line-length = 99
[tool.ruff] [tool.ruff]
line-length = 99 line-length = 99
[tool.hatch.metadata]
allow-direct-references = true

Просмотреть файл

@ -10,9 +10,9 @@
<a href="https://github.com/vladkens/twscrape/actions"> <a href="https://github.com/vladkens/twscrape/actions">
<img src="https://github.com/vladkens/twscrape/workflows/test/badge.svg" alt="test status" /> <img src="https://github.com/vladkens/twscrape/workflows/test/badge.svg" alt="test status" />
</a> </a>
<!-- <a href="https://pypi.org/project/twscrape"> <a href="https://pypi.org/project/twscrape">
<img src="https://badgen.net/pypi/dm/twscrape" alt="downloads" /> <img src="https://badgen.net/pypi/dm/twscrape" alt="downloads" />
</a> --> </a>
<a href="https://github.com/vladkens/twscrape/blob/main/LICENSE"> <a href="https://github.com/vladkens/twscrape/blob/main/LICENSE">
<img src="https://badgen.net/github/license/vladkens/twscrape" alt="license" /> <img src="https://badgen.net/github/license/vladkens/twscrape" alt="license" />
</a> </a>

Просмотреть файл

@ -252,6 +252,7 @@ async def test_user_tweets_and_replies():
for doc in tweets: for doc in tweets:
check_tweet(doc) check_tweet(doc)
async def test_list_timeline(): async def test_list_timeline():
api = API() api = API()
mock_gen(api, "list_timeline_raw") mock_gen(api, "list_timeline_raw")

Просмотреть файл

@ -1,5 +1,5 @@
from twscrape.accounts_pool import AccountsPool from twscrape.accounts_pool import AccountsPool
from twscrape.utils import utc_ts from twscrape.utils import utc
async def test_add_accounts(pool_mock: AccountsPool): async def test_add_accounts(pool_mock: AccountsPool):
@ -102,7 +102,7 @@ async def test_account_unlock(pool_mock: AccountsPool):
assert acc.locks[Q] is not None assert acc.locks[Q] is not None
# should update lock time # should update lock time
end_time = utc_ts() + 60 # + 1 minute end_time = utc.ts() + 60 # + 1 minute
await pool_mock.lock_until(acc.username, Q, end_time) await pool_mock.lock_until(acc.username, Q, end_time)
acc = await pool_mock.get(acc.username) acc = await pool_mock.get(acc.username)

Просмотреть файл

@ -7,7 +7,7 @@ from httpx import AsyncClient, AsyncHTTPTransport
from .constants import TOKEN from .constants import TOKEN
from .models import JSONTrait from .models import JSONTrait
from .utils import from_utciso from .utils import utc
@dataclass @dataclass
@ -30,12 +30,12 @@ class Account(JSONTrait):
@staticmethod @staticmethod
def from_rs(rs: sqlite3.Row): def from_rs(rs: sqlite3.Row):
doc = dict(rs) doc = dict(rs)
doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()} doc["locks"] = {k: utc.from_iso(v) for k, v in json.loads(doc["locks"]).items()}
doc["stats"] = {k: v for k, v in json.loads(doc["stats"]).items() if isinstance(v, int)} doc["stats"] = {k: v for k, v in json.loads(doc["stats"]).items() if isinstance(v, int)}
doc["headers"] = json.loads(doc["headers"]) doc["headers"] = json.loads(doc["headers"])
doc["cookies"] = json.loads(doc["cookies"]) doc["cookies"] = json.loads(doc["cookies"])
doc["active"] = bool(doc["active"]) doc["active"] = bool(doc["active"])
doc["last_used"] = from_utciso(doc["last_used"]) if doc["last_used"] else None doc["last_used"] = utc.from_iso(doc["last_used"]) if doc["last_used"] else None
return Account(**doc) return Account(**doc)
def to_rs(self): def to_rs(self):

Просмотреть файл

@ -11,7 +11,7 @@ from .account import Account
from .db import execute, fetchall, fetchone from .db import execute, fetchall, fetchone
from .logger import logger from .logger import logger
from .login import login from .login import login
from .utils import parse_cookies, utc_ts from .utils import parse_cookies, utc
class AccountInfo(TypedDict): class AccountInfo(TypedDict):
@ -197,7 +197,7 @@ class AccountsPool:
UPDATE accounts SET UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch')), locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch')),
stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}), stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}),
last_used = datetime({utc_ts()}, 'unixepoch') last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = :username WHERE username = :username
""" """
await execute(self._db_file, qs, {"username": username}) await execute(self._db_file, qs, {"username": username})
@ -207,7 +207,7 @@ class AccountsPool:
UPDATE accounts SET UPDATE accounts SET
locks = json_remove(locks, '$.{queue}'), locks = json_remove(locks, '$.{queue}'),
stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}), stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}),
last_used = datetime({utc_ts()}, 'unixepoch') last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = :username WHERE username = :username
""" """
await execute(self._db_file, qs, {"username": username}) await execute(self._db_file, qs, {"username": username})
@ -228,7 +228,7 @@ class AccountsPool:
qs = f""" qs = f"""
UPDATE accounts SET UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')), locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')),
last_used = datetime({utc_ts()}, 'unixepoch') last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = ({q1}) WHERE username = ({q1})
RETURNING * RETURNING *
""" """
@ -238,7 +238,7 @@ class AccountsPool:
qs = f""" qs = f"""
UPDATE accounts SET UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')), locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')),
last_used = datetime({utc_ts()}, 'unixepoch'), last_used = datetime({utc.ts()}, 'unixepoch'),
_tx = '{tx}' _tx = '{tx}'
WHERE username = ({q1}) WHERE username = ({q1})
""" """
@ -277,8 +277,7 @@ class AccountsPool:
""" """
rs = await fetchone(self._db_file, qs) rs = await fetchone(self._db_file, qs)
if rs: if rs:
now = datetime.utcnow().replace(tzinfo=timezone.utc) now, trg = utc.now(), utc.from_iso(rs[0])
trg = datetime.fromisoformat(rs[0]).replace(tzinfo=timezone.utc)
if trg < now: if trg < now:
return "now" return "now"

Просмотреть файл

@ -109,7 +109,7 @@ class API:
"hidden_profile_likes_enabled": True, "hidden_profile_likes_enabled": True,
"highlights_tweets_tab_ui_enabled": True, "highlights_tweets_tab_ui_enabled": True,
"creator_subscriptions_tweet_preview_api_enabled": True, "creator_subscriptions_tweet_preview_api_enabled": True,
"hidden_profile_subscriptions_enabled": True "hidden_profile_subscriptions_enabled": True,
} }
return await self._gql_item(op, kv, ft) return await self._gql_item(op, kv, ft)
@ -128,7 +128,7 @@ class API:
"creator_subscriptions_tweet_preview_api_enabled": True, "creator_subscriptions_tweet_preview_api_enabled": True,
"subscriptions_verification_info_verified_since_enabled": True, "subscriptions_verification_info_verified_since_enabled": True,
"hidden_profile_subscriptions_enabled": True, "hidden_profile_subscriptions_enabled": True,
"subscriptions_verification_info_is_identity_verified_enabled": False "subscriptions_verification_info_is_identity_verified_enabled": False,
} }
return await self._gql_item(op, kv, ft) return await self._gql_item(op, kv, ft)

Просмотреть файл

@ -1,4 +1,4 @@
from datetime import datetime, timedelta, timezone from datetime import timedelta
from httpx import AsyncClient, HTTPStatusError, Response from httpx import AsyncClient, HTTPStatusError, Response
@ -6,7 +6,7 @@ from .account import Account
from .constants import LOGIN_URL from .constants import LOGIN_URL
from .imap import imap_get_email_code, imap_login from .imap import imap_get_email_code, imap_login
from .logger import logger from .logger import logger
from .utils import raise_for_status from .utils import raise_for_status, utc
async def get_guest_token(client: AsyncClient): async def get_guest_token(client: AsyncClient):
@ -120,7 +120,7 @@ async def login_confirm_email_code(client: AsyncClient, acc: Account, prev: dict
if not imap: if not imap:
imap = await imap_login(acc.email, acc.email_password) imap = await imap_login(acc.email, acc.email_password)
now_time = datetime.now(timezone.utc) - timedelta(seconds=30) now_time = utc.now() - timedelta(seconds=30)
value = await imap_get_email_code(imap, acc.email, now_time) value = await imap_get_email_code(imap, acc.email, now_time)
payload = { payload = {

Просмотреть файл

@ -12,7 +12,7 @@ from typing import Generator, Optional
import httpx import httpx
from .logger import logger from .logger import logger
from .utils import find_item, get_or, int_or, to_old_rep from .utils import find_item, get_or, int_or, to_old_rep, utc
@dataclass @dataclass
@ -407,7 +407,7 @@ def _get_views(obj: dict, rt_obj: dict):
def _write_dump(kind: str, e: Exception, x: dict, obj: dict): def _write_dump(kind: str, e: Exception, x: dict, obj: dict):
uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
time = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S") time = utc.now().strftime("%Y-%m-%d_%H-%M-%S")
dumpfile = f"/tmp/twscrape/twscrape_parse_error_{time}_{uniq}.txt" dumpfile = f"/tmp/twscrape/twscrape_parse_error_{time}_{uniq}.txt"
os.makedirs(os.path.dirname(dumpfile), exist_ok=True) os.makedirs(os.path.dirname(dumpfile), exist_ok=True)

Просмотреть файл

@ -1,16 +1,15 @@
import json import json
import os import os
from datetime import datetime
from typing import Any from typing import Any
import httpx import httpx
from .accounts_pool import Account, AccountsPool from .accounts_pool import Account, AccountsPool
from .logger import logger from .logger import logger
from .utils import utc_ts from .utils import utc
ReqParams = dict[str, str | int] | None ReqParams = dict[str, str | int] | None
TMP_TS = datetime.utcnow().isoformat().split(".")[0].replace("T", "_").replace(":", "-")[0:16] TMP_TS = utc.now().isoformat().split(".")[0].replace("T", "_").replace(":", "-")[0:16]
class Ctx: class Ctx:
@ -39,6 +38,7 @@ class RateLimitError(Exception):
class BannedError(Exception): class BannedError(Exception):
pass pass
class DependencyError(Exception): class DependencyError(Exception):
pass pass
@ -151,7 +151,7 @@ class QueueClient:
# possible new limits for tweets view per account # possible new limits for tweets view per account
if msg.startswith("(88) Rate limit exceeded") or rep.status_code == 429: if msg.startswith("(88) Rate limit exceeded") or rep.status_code == 429:
await self._close_ctx(utc_ts() + 60 * 60 * 4) # lock for 4 hours await self._close_ctx(utc.ts() + 60 * 60 * 4) # lock for 4 hours
raise RateLimitError(msg) raise RateLimitError(msg)
if msg.startswith("(326) Authorization: Denied by access control"): if msg.startswith("(326) Authorization: Denied by access control"):
@ -163,7 +163,7 @@ class QueueClient:
# possible banned by old api flow # possible banned by old api flow
if rep.status_code in (401, 403): if rep.status_code in (401, 403):
await self._close_ctx(utc_ts() + 60 * 60 * 12) # lock for 12 hours await self._close_ctx(utc.ts() + 60 * 60 * 12) # lock for 12 hours
raise RateLimitError(msg) raise RateLimitError(msg)
# content not found # content not found
@ -196,7 +196,7 @@ class QueueClient:
except (RateLimitError, BannedError): except (RateLimitError, BannedError):
# already handled # already handled
continue continue
except (DependencyError): except DependencyError:
logger.error(f"Dependency error, returnning: {url}") logger.error(f"Dependency error, returnning: {url}")
return return
except (httpx.ReadTimeout, httpx.ProxyError): except (httpx.ReadTimeout, httpx.ProxyError):
@ -206,4 +206,4 @@ class QueueClient:
retry_count += 1 retry_count += 1
if retry_count >= 3: if retry_count >= 3:
logger.warning(f"Unknown error {type(e)}: {e}") logger.warning(f"Unknown error {type(e)}: {e}")
await self._close_ctx(utc_ts() + 60 * 15) # 15 minutes await self._close_ctx(utc.ts() + 60 * 15) # 15 minutes

Просмотреть файл

@ -11,6 +11,20 @@ from .logger import logger
T = TypeVar("T") T = TypeVar("T")
class utc:
    """Namespace of helpers that always work with timezone-aware UTC datetimes."""

    @staticmethod
    def now() -> datetime:
        """Return the current time as a timezone-aware UTC datetime."""
        return datetime.now(timezone.utc)

    @staticmethod
    def from_iso(iso: str) -> datetime:
        """Parse an ISO-8601 string into a timezone-aware UTC datetime.

        A string without an offset is taken to already be in UTC and is only
        labelled as such. A string that carries an offset is converted to UTC,
        so the instant it denotes is preserved rather than relabelled.

        Raises:
            ValueError: if ``iso`` is not accepted by ``datetime.fromisoformat``.
        """
        parsed = datetime.fromisoformat(iso)
        if parsed.utcoffset() is None:
            # Naive value: attach UTC without shifting the wall-clock fields.
            return parsed.replace(tzinfo=timezone.utc)
        # Aware value: shift to UTC instead of overwriting its offset.
        return parsed.astimezone(timezone.utc)

    @staticmethod
    def ts() -> int:
        """Return the current Unix timestamp in whole seconds."""
        return int(utc.now().timestamp())
async def gather(gen: AsyncGenerator[T, None]) -> list[T]: async def gather(gen: AsyncGenerator[T, None]) -> list[T]:
items = [] items = []
async for x in gen: async for x in gen:
@ -147,14 +161,6 @@ def to_old_rep(obj: dict) -> dict[str, dict]:
return {"tweets": {**tw1, **tw2}, "users": users} return {"tweets": {**tw1, **tw2}, "users": users}
def utc_ts() -> int:
return int(datetime.utcnow().replace(tzinfo=timezone.utc).timestamp())
def from_utciso(iso: str) -> datetime:
return datetime.fromisoformat(iso).replace(tzinfo=timezone.utc)
def print_table(rows: list[dict], hr_after=False): def print_table(rows: list[dict], hr_after=False):
if not rows: if not rows:
return return