Этот коммит содержится в:
Vlad Pronsky 2023-11-01 20:46:01 +02:00
родитель fd64ce2018
Коммит 6a232da016
17 изменённых файлов: 100 добавлений и 74 удалений

2
.github/workflows/test.yml поставляемый
Просмотреть файл

@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11"]
python-version: ["3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4

1
.tool-versions Обычный файл
Просмотреть файл

@ -0,0 +1 @@
python 3.12.0

4
.vscode/settings.json поставляемый
Просмотреть файл

@ -8,7 +8,7 @@
},
"[python]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": ["source.organizeImports"]
"editor.codeActionsOnSave": ["source.organizeImports"],
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "black"
}

11
Dockerfile.python Обычный файл
Просмотреть файл

@ -0,0 +1,11 @@
# Test image: installs the project with its dev extras and runs the test suite.
# VER selects the Python base image tag (python:<VER>-alpine).
ARG VER=3.12
FROM python:${VER}-alpine
# --no-cache: fetch the apk index on the fly and keep it out of the image layer.
RUN apk add --no-cache git
WORKDIR /app
# Copy only the packaging metadata first so the dependency layer below stays
# cached until pyproject.toml or readme.md change.
COPY pyproject.toml readme.md /app/
# --no-cache-dir: do not store pip's download cache in the layer.
RUN pip install --no-cache-dir -e .[dev]
COPY . /app
# Shell-form CMD: print the interpreter version, then run the tests.
CMD python --version; pytest tests/

Просмотреть файл

Просмотреть файл

@ -1,7 +1,5 @@
.PHONY: all build
SQTEST = docker -l warning build -f sqlite.dockerfile
all:
@echo "hi"
@ -38,27 +36,33 @@ show-cov:
@coverage html
@open htmlcov/index.html
# Invoke the `act` CLI with linux/amd64 as the container architecture.
# NOTE(review): presumably this is nektos/act used to run the GitHub Actions
# workflows locally - confirm.
act:
@act --container-architecture linux/amd64
# Print a bullet list of commit subjects from the most recent tag (inclusive,
# via `^`) up to HEAD.
# The tag lookup uses shell command substitution ($$(...)) rather than make's
# $(shell ...): make expands every recipe line before running the first one,
# so $(shell git describe) would resolve the tag BEFORE `git pull --tags`
# has fetched any new tags.
.PHONY: changelog
changelog:
	@git pull origin --tags > /dev/null
	@git log "$$(git describe --tags --abbrev=0 HEAD)^..HEAD" --pretty=format:'- %s'
test34:
# Build the Dockerfile.python image for one Python version and run it.
# Usage: make test-py v=3.12
#   v - Python version, passed to the build as VER and used in the image tag.
.PHONY: test-py
test-py:
	$(if $(strip $(v)),,$(error v is required: make test-py v=3.12))
	$(eval name=twscrape_py$(v))
	@docker -l warning build -f Dockerfile.python --build-arg VER=$(v) -t $(name) .
	@docker run $(name)
# Build the Dockerfile.sqlite image for one SQLite release and run it.
# Usage: make test-sq y=2023 v=3430000
#   y - passed to the build as SQLY
#   v - passed to the build as SQLV and used in the image tag
.PHONY: test-sq
test-sq:
	$(if $(strip $(y)),,$(error y is required: make test-sq y=2023 v=3430000))
	$(if $(strip $(v)),,$(error v is required: make test-sq y=2023 v=3430000))
	$(eval name=twscrape_sq$(v))
	@docker -l warning build -f Dockerfile.sqlite --build-arg SQLY=$(y) --build-arg SQLV=$(v) -t $(name) .
	@docker run $(name)
# Run test-py once per listed Python version.
# Sub-makes go through $(MAKE) instead of a literal `make` so the same make
# binary is used and command-line flags (-n, -j/jobserver, -k) are propagated.
.PHONY: test-py-matrix
test-py-matrix:
	@$(MAKE) test-py v=3.10
	@$(MAKE) test-py v=3.11
	@$(MAKE) test-py v=3.12
# Build and run the test image against a range of SQLite releases.
# Sub-makes go through $(MAKE) instead of a literal `make` so the same make
# binary is used and command-line flags (-n, -j/jobserver, -k) are propagated.
.PHONY: test-sq-matrix
test-sq-matrix:
	@# https://www.sqlite.org/chronology.html
	@$(SQTEST) --build-arg SQLY=2018 --build-arg SQLV=3240000 -t twscrape_sq24 .
	@$(SQTEST) --build-arg SQLY=2019 --build-arg SQLV=3270200 -t twscrape_sq27 .
	@$(SQTEST) --build-arg SQLY=2019 --build-arg SQLV=3300100 -t twscrape_sq30 .
	@$(SQTEST) --build-arg SQLY=2020 --build-arg SQLV=3330000 -t twscrape_sq33 .
	@$(SQTEST) --build-arg SQLY=2021 --build-arg SQLV=3340100 -t twscrape_sq34 .
	@$(SQTEST) --build-arg SQLY=2023 --build-arg SQLV=3430000 -t twscrape_sq43 .
	@docker run twscrape_sq24
	@docker run twscrape_sq27
	@docker run twscrape_sq30
	@docker run twscrape_sq33
	@docker run twscrape_sq34
	@docker run twscrape_sq43
	@$(MAKE) test-sq y=2018 v=3240000
	@$(MAKE) test-sq y=2019 v=3270200
	@$(MAKE) test-sq y=2019 v=3300100
	@$(MAKE) test-sq y=2020 v=3330000
	@$(MAKE) test-sq y=2021 v=3340100
	@$(MAKE) test-sq y=2023 v=3430000
update-mocks:
twscrape user_by_id --raw 2244994945 | jq > ./tests/mocked-data/user_by_id_raw.json

Просмотреть файл

@ -16,22 +16,23 @@ classifiers = [
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
]
dependencies = [
"aiosqlite==0.17.0",
"fake-useragent==1.2.1",
"httpx==0.24.0",
"loguru==0.7.0",
"aiosqlite>=0.17.0",
"fake-useragent>=1.3.0",
"httpx>=0.24.0",
"loguru>=0.7.0",
]
[project.optional-dependencies]
dev = [
"pylint==2.17.3",
"pytest-asyncio==0.21.0",
"pytest-cov==4.0.0",
"pytest-httpx==0.22.0",
"pytest==7.3.1",
"ruff==0.0.263",
"pylint>=2.17.3",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.0.0",
"pytest-httpx>=0.22.0",
"pytest>=7.4.0",
"ruff"
]
[project.urls]
@ -46,13 +47,13 @@ packages = ['twscrape']
[tool.pylint]
max-line-length = 99
disable = [
"C0103", # invalid-name
"C0114", # missing-module-docstring
"C0115", # missing-class-docstring
"C0116", # missing-function-docstring
"R0903", # too-few-public-methods
"R0913", # too-many-arguments
"W0105", # pointless-string-statement
"C0103", # invalid-name
"C0114", # missing-module-docstring
"C0115", # missing-class-docstring
"C0116", # missing-function-docstring
"R0903", # too-few-public-methods
"R0913", # too-many-arguments
"W0105", # pointless-string-statement
]
[tool.pytest.ini_options]
@ -67,3 +68,6 @@ line-length = 99
[tool.ruff]
line-length = 99
[tool.hatch.metadata]
allow-direct-references = true

Просмотреть файл

@ -10,9 +10,9 @@
<a href="https://github.com/vladkens/twscrape/actions">
<img src="https://github.com/vladkens/twscrape/workflows/test/badge.svg" alt="test status" />
</a>
<!-- <a href="https://pypi.org/project/twscrape">
<a href="https://pypi.org/project/twscrape">
<img src="https://badgen.net/pypi/dm/twscrape" alt="downloads" />
</a> -->
</a>
<a href="https://github.com/vladkens/twscrape/blob/main/LICENSE">
<img src="https://badgen.net/github/license/vladkens/twscrape" alt="license" />
</a>

Просмотреть файл

@ -252,6 +252,7 @@ async def test_user_tweets_and_replies():
for doc in tweets:
check_tweet(doc)
async def test_list_timeline():
api = API()
mock_gen(api, "list_timeline_raw")

Просмотреть файл

@ -1,5 +1,5 @@
from twscrape.accounts_pool import AccountsPool
from twscrape.utils import utc_ts
from twscrape.utils import utc
async def test_add_accounts(pool_mock: AccountsPool):
@ -102,7 +102,7 @@ async def test_account_unlock(pool_mock: AccountsPool):
assert acc.locks[Q] is not None
# should update lock time
end_time = utc_ts() + 60 # + 1 minute
end_time = utc.ts() + 60 # + 1 minute
await pool_mock.lock_until(acc.username, Q, end_time)
acc = await pool_mock.get(acc.username)

Просмотреть файл

@ -7,7 +7,7 @@ from httpx import AsyncClient, AsyncHTTPTransport
from .constants import TOKEN
from .models import JSONTrait
from .utils import from_utciso
from .utils import utc
@dataclass
@ -30,12 +30,12 @@ class Account(JSONTrait):
@staticmethod
def from_rs(rs: sqlite3.Row):
doc = dict(rs)
doc["locks"] = {k: from_utciso(v) for k, v in json.loads(doc["locks"]).items()}
doc["locks"] = {k: utc.from_iso(v) for k, v in json.loads(doc["locks"]).items()}
doc["stats"] = {k: v for k, v in json.loads(doc["stats"]).items() if isinstance(v, int)}
doc["headers"] = json.loads(doc["headers"])
doc["cookies"] = json.loads(doc["cookies"])
doc["active"] = bool(doc["active"])
doc["last_used"] = from_utciso(doc["last_used"]) if doc["last_used"] else None
doc["last_used"] = utc.from_iso(doc["last_used"]) if doc["last_used"] else None
return Account(**doc)
def to_rs(self):

Просмотреть файл

@ -11,7 +11,7 @@ from .account import Account
from .db import execute, fetchall, fetchone
from .logger import logger
from .login import login
from .utils import parse_cookies, utc_ts
from .utils import parse_cookies, utc
class AccountInfo(TypedDict):
@ -197,7 +197,7 @@ class AccountsPool:
UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime({unlock_at}, 'unixepoch')),
stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}),
last_used = datetime({utc_ts()}, 'unixepoch')
last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = :username
"""
await execute(self._db_file, qs, {"username": username})
@ -207,7 +207,7 @@ class AccountsPool:
UPDATE accounts SET
locks = json_remove(locks, '$.{queue}'),
stats = json_set(stats, '$.{queue}', COALESCE(json_extract(stats, '$.{queue}'), 0) + {req_count}),
last_used = datetime({utc_ts()}, 'unixepoch')
last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = :username
"""
await execute(self._db_file, qs, {"username": username})
@ -228,7 +228,7 @@ class AccountsPool:
qs = f"""
UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')),
last_used = datetime({utc_ts()}, 'unixepoch')
last_used = datetime({utc.ts()}, 'unixepoch')
WHERE username = ({q1})
RETURNING *
"""
@ -238,7 +238,7 @@ class AccountsPool:
qs = f"""
UPDATE accounts SET
locks = json_set(locks, '$.{queue}', datetime('now', '+15 minutes')),
last_used = datetime({utc_ts()}, 'unixepoch'),
last_used = datetime({utc.ts()}, 'unixepoch'),
_tx = '{tx}'
WHERE username = ({q1})
"""
@ -277,8 +277,7 @@ class AccountsPool:
"""
rs = await fetchone(self._db_file, qs)
if rs:
now = datetime.utcnow().replace(tzinfo=timezone.utc)
trg = datetime.fromisoformat(rs[0]).replace(tzinfo=timezone.utc)
now, trg = utc.now(), utc.from_iso(rs[0])
if trg < now:
return "now"

Просмотреть файл

@ -109,7 +109,7 @@ class API:
"hidden_profile_likes_enabled": True,
"highlights_tweets_tab_ui_enabled": True,
"creator_subscriptions_tweet_preview_api_enabled": True,
"hidden_profile_subscriptions_enabled": True
"hidden_profile_subscriptions_enabled": True,
}
return await self._gql_item(op, kv, ft)
@ -128,7 +128,7 @@ class API:
"creator_subscriptions_tweet_preview_api_enabled": True,
"subscriptions_verification_info_verified_since_enabled": True,
"hidden_profile_subscriptions_enabled": True,
"subscriptions_verification_info_is_identity_verified_enabled": False
"subscriptions_verification_info_is_identity_verified_enabled": False,
}
return await self._gql_item(op, kv, ft)

Просмотреть файл

@ -1,4 +1,4 @@
from datetime import datetime, timedelta, timezone
from datetime import timedelta
from httpx import AsyncClient, HTTPStatusError, Response
@ -6,7 +6,7 @@ from .account import Account
from .constants import LOGIN_URL
from .imap import imap_get_email_code, imap_login
from .logger import logger
from .utils import raise_for_status
from .utils import raise_for_status, utc
async def get_guest_token(client: AsyncClient):
@ -120,7 +120,7 @@ async def login_confirm_email_code(client: AsyncClient, acc: Account, prev: dict
if not imap:
imap = await imap_login(acc.email, acc.email_password)
now_time = datetime.now(timezone.utc) - timedelta(seconds=30)
now_time = utc.now() - timedelta(seconds=30)
value = await imap_get_email_code(imap, acc.email, now_time)
payload = {

Просмотреть файл

@ -12,7 +12,7 @@ from typing import Generator, Optional
import httpx
from .logger import logger
from .utils import find_item, get_or, int_or, to_old_rep
from .utils import find_item, get_or, int_or, to_old_rep, utc
@dataclass
@ -407,7 +407,7 @@ def _get_views(obj: dict, rt_obj: dict):
def _write_dump(kind: str, e: Exception, x: dict, obj: dict):
uniq = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
time = datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
time = utc.now().strftime("%Y-%m-%d_%H-%M-%S")
dumpfile = f"/tmp/twscrape/twscrape_parse_error_{time}_{uniq}.txt"
os.makedirs(os.path.dirname(dumpfile), exist_ok=True)

Просмотреть файл

@ -1,16 +1,15 @@
import json
import os
from datetime import datetime
from typing import Any
import httpx
from .accounts_pool import Account, AccountsPool
from .logger import logger
from .utils import utc_ts
from .utils import utc
ReqParams = dict[str, str | int] | None
TMP_TS = datetime.utcnow().isoformat().split(".")[0].replace("T", "_").replace(":", "-")[0:16]
TMP_TS = utc.now().isoformat().split(".")[0].replace("T", "_").replace(":", "-")[0:16]
class Ctx:
@ -39,6 +38,7 @@ class RateLimitError(Exception):
class BannedError(Exception):
pass
class DependencyError(Exception):
pass
@ -151,7 +151,7 @@ class QueueClient:
# possible new limits for tweets view per account
if msg.startswith("(88) Rate limit exceeded") or rep.status_code == 429:
await self._close_ctx(utc_ts() + 60 * 60 * 4) # lock for 4 hours
await self._close_ctx(utc.ts() + 60 * 60 * 4) # lock for 4 hours
raise RateLimitError(msg)
if msg.startswith("(326) Authorization: Denied by access control"):
@ -163,7 +163,7 @@ class QueueClient:
# possible banned by old api flow
if rep.status_code in (401, 403):
await self._close_ctx(utc_ts() + 60 * 60 * 12) # lock for 12 hours
await self._close_ctx(utc.ts() + 60 * 60 * 12) # lock for 12 hours
raise RateLimitError(msg)
# content not found
@ -196,7 +196,7 @@ class QueueClient:
except (RateLimitError, BannedError):
# already handled
continue
except (DependencyError):
except DependencyError:
logger.error(f"Dependency error, returnning: {url}")
return
except (httpx.ReadTimeout, httpx.ProxyError):
@ -206,4 +206,4 @@ class QueueClient:
retry_count += 1
if retry_count >= 3:
logger.warning(f"Unknown error {type(e)}: {e}")
await self._close_ctx(utc_ts() + 60 * 15) # 15 minutes
await self._close_ctx(utc.ts() + 60 * 15) # 15 minutes

Просмотреть файл

@ -11,6 +11,20 @@ from .logger import logger
T = TypeVar("T")
class utc:
    """Namespace of static helpers that produce timezone-aware UTC values."""

    @staticmethod
    def now() -> datetime:
        """Return the current time as a timezone-aware UTC datetime."""
        return datetime.now(timezone.utc)

    @staticmethod
    def from_iso(iso: str) -> datetime:
        """Parse an ISO-8601 string into a timezone-aware UTC datetime.

        A string without an offset is taken to already be in UTC. A string
        that carries an explicit offset is converted to UTC, so the instant
        it denotes is preserved instead of being relabelled.
        """
        parsed = datetime.fromisoformat(iso)
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    @staticmethod
    def ts() -> int:
        """Return the current Unix timestamp truncated to whole seconds."""
        return int(utc.now().timestamp())
async def gather(gen: AsyncGenerator[T, None]) -> list[T]:
items = []
async for x in gen:
@ -147,14 +161,6 @@ def to_old_rep(obj: dict) -> dict[str, dict]:
return {"tweets": {**tw1, **tw2}, "users": users}
def utc_ts() -> int:
    """Return the current Unix timestamp truncated to whole seconds."""
    # datetime.now(timezone.utc) yields the same aware value as
    # utcnow().replace(tzinfo=utc) without the utcnow() call deprecated in 3.12.
    return int(datetime.now(timezone.utc).timestamp())
def from_utciso(iso: str) -> datetime:
    """Parse an ISO-8601 string into a timezone-aware UTC datetime.

    A string without an offset is taken to already be in UTC. A string that
    carries an explicit offset is converted to UTC so the instant it denotes
    is preserved.
    """
    parsed = datetime.fromisoformat(iso)
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
def print_table(rows: list[dict], hr_after=False):
if not rows:
return