diff --git a/readme.md b/readme.md index ed854ff..1b1b940 100644 --- a/readme.md +++ b/readme.md @@ -99,7 +99,58 @@ if __name__ == "__main__": ## CLI -You can also use the CLI to make requests (before that you need to log in to some accounts through the programming interface). +### Get help on CLI commands + +```sh +# show all commands +twscrape + +# help on specific command +twscrape search --help +``` + +### Add accounts & login + +First add accounts from file: + +```sh +# usage: twscrape add_accounts file_path line_format +# line_format should have "username", "password", "email", "email_password" tokens +# tokens delimiter should be the same as in the file +twscrape add_accounts accounts.txt username:password:email:email_password +``` + +Then call login: + +```sh +twscrape login_accounts +``` + +Accounts and their sessions will be saved, so they can be reused for future requests + +### Get list of accounts and their statuses + +```sh +twscrape accounts + +# Output: +# ─────────────────────────────────────────────────────────────────────────────────── +# username logged_in active last_used total_req error_msg +# ─────────────────────────────────────────────────────────────────────────────────── +# user1 True True 2023-05-20 03:20:40 100 None +# user2 True True 2023-05-20 03:25:45 120 None +# user3 False False None 120 Login error +``` + +### Use different accounts file + +Useful if using a different set of accounts for different actions + +``` +twscrape --db test-accounts.db +``` + +### Search commands ```sh twscrape search "QUERY" --limit=20 @@ -126,38 +177,6 @@ By default, parsed data is returned. 
The original tweet responses can be retriev twscrape search "elon mask lang:es" --limit=20 --raw ``` -View a list of commands: - -```sh -# show all commands -twscrape - -# help on specific comand -twscrape search --help -``` - -## Advanced usage - -### Get list of connected accounts and their statuses - -```sh -twscrape accounts - -# Output: -# ─────────────────────────────────────────────────────────────────────────────────── -# username logged_in active last_used total_req error_msg -# ─────────────────────────────────────────────────────────────────────────────────── -# user1 True True 2023-05-20 03:20:40 100 None -# user2 True True 2023-05-20 03:25:45 120 None -# user3 False False None 120 Login error -``` - -Or from code: -```python -pool = AccountsPool() -print(await pool.accounts_info()) # list -``` - ## Limitations API rate limits (per account): diff --git a/twscrape/accounts_pool.py b/twscrape/accounts_pool.py index 4d590d0..8476abd 100644 --- a/twscrape/accounts_pool.py +++ b/twscrape/accounts_pool.py @@ -11,10 +11,36 @@ from .login import login from .utils import utc_ts +def guess_delim(line: str): + l, r = [x.strip() for x in line.split("username")] + return r[0] if not l else l[-1] + + class AccountsPool: def __init__(self, db_file="accounts.db"): self._db_file = db_file + async def load_from_file(self, filepath: str, line_format: str): + assert "username" in line_format, "username is required" + assert "password" in line_format, "password is required" + assert "email" in line_format, "email is required" + assert "email_password" in line_format, "email_password is required" + + line_delim = guess_delim(line_format) + tokens = line_format.split(line_delim) + + with open(filepath, "r") as f: + lines = f.read().split("\n") + lines = [x.strip() for x in lines if x.strip()] + for line in lines: + data = [x.strip() for x in line.split(line_delim)] + if len(data) < len(tokens): + logger.warning(f"Invalid line format: {line}") + continue + + data = data[: 
len(tokens)] + await self.add_account(**{k: v for k, v in zip(tokens, data)}) + async def add_account( self, username: str, @@ -27,8 +53,11 @@ class AccountsPool: qs = "SELECT * FROM accounts WHERE username = :username" rs = await fetchone(self._db_file, qs, {"username": username}) if rs: + logger.debug(f"Account {username} already exists") return + logger.debug(f"Adding account {username}") + account = Account( username=username, password=password, @@ -69,6 +98,7 @@ class AccountsPool: async def login(self, account: Account): try: await login(account) + logger.info(f"Logged in to {account.username} successfully") except Exception as e: logger.error(f"Error logging in to {account.username}: {e}") finally: diff --git a/twscrape/cli.py b/twscrape/cli.py index af2d5be..d41aad6 100644 --- a/twscrape/cli.py +++ b/twscrape/cli.py @@ -2,6 +2,7 @@ import argparse import asyncio +import io from .api import API, AccountsPool from .logger import logger, set_log_level @@ -10,6 +11,11 @@ from .utils import print_table VER = "0.1.0" +class CustomHelpFormatter(argparse.HelpFormatter): + def __init__(self, prog): + super().__init__(prog, max_help_position=30, width=120) + + def get_fn_arg(args): names = ["query", "tweet_id", "user_id", "username"] for name in names: @@ -24,13 +30,14 @@ async def main(args): if args.debug: set_log_level("DEBUG") - pool = AccountsPool(args.db) - api = API(pool, debug=args.debug) - if args.command == "version": print(VER) return + logger.debug(f"Using database: {args.db}") + pool = AccountsPool(args.db) + api = API(pool, debug=args.debug) + if args.command == "accounts": print_table(await pool.accounts_info()) return @@ -39,6 +46,14 @@ async def main(args): print(await pool.stats()) return + if args.command == "add_accounts": + await pool.load_from_file(args.file_path, args.line_format) + return + + if args.command == "login_accounts": + await pool.login_all() + return + fn = args.command + "_raw" if args.raw else args.command fn = getattr(api, fn, 
None) if fn is None: @@ -55,8 +70,29 @@ async def main(args): print(doc.json()) +def custom_help(p): + buffer = io.StringIO() + p.print_help(buffer) + msg = buffer.getvalue() + + cmd = msg.split("positional arguments:")[1].strip().split("\n")[0] + msg = msg.replace("positional arguments:", "commands:") + msg = [x for x in msg.split("\n") if not cmd in x and not "..." in x] + msg[0] = f"{msg[0]} [...]" + + i = 0 + for i, line in enumerate(msg): + if line.strip().startswith("search"): + break + + msg.insert(i, "") + msg.insert(i + 1, "search commands:") + + print("\n".join(msg)) + + def run(): - p = argparse.ArgumentParser(add_help=False) + p = argparse.ArgumentParser(add_help=False, formatter_class=CustomHelpFormatter) p.add_argument("--db", default="accounts.db", help="Accounts database file") p.add_argument("--debug", action="store_true", help="Enable debug mode") subparsers = p.add_subparsers(dest="command") @@ -74,7 +110,11 @@ def run(): subparsers.add_parser("version", help="Show version") subparsers.add_parser("accounts", help="List all accounts") - subparsers.add_parser("stats", help="Show scraping statistics") + + add_accounts = subparsers.add_parser("add_accounts", help="Add accounts") + add_accounts.add_argument("file_path", help="File with accounts") + add_accounts.add_argument("line_format", help="args of Pool.add_account splited by same delim") + subparsers.add_parser("login_accounts", help="Login accounts") clim("search", "Search for tweets", "query", "Search query") cone("tweet_details", "Get tweet details", "tweet_id", "Tweet ID", int) @@ -89,7 +129,6 @@ def run(): args = p.parse_args() if args.command is None: - p.print_help() - return + return custom_help(p) asyncio.run(main(args)) diff --git a/twscrape/imap.py b/twscrape/imap.py index ee734b2..3009367 100644 --- a/twscrape/imap.py +++ b/twscrape/imap.py @@ -9,8 +9,35 @@ from .logger import logger MAX_WAIT_SEC = 30 +class EmailLoginError(Exception): + def __init__(self, message="Email login 
error"): + self.message = message + super().__init__(self.message) + + +class EmailCodeTimeoutError(Exception): + def __init__(self, message="Email code timeout"): + self.message = message + super().__init__(self.message) + + +IMAP_MAPPING: dict[str, str] = { + "yahoo.com": "imap.mail.yahoo.com", + "icloud.com": "imap.mail.me.com", + "outlook.com": "imap-mail.outlook.com", + "hotmail.com": "imap-mail.outlook.com", +} + + +def add_imap_mapping(email_domain: str, imap_domain: str): + IMAP_MAPPING[email_domain] = imap_domain + + def get_imap_domain(email: str) -> str: - return f"imap.{email.split('@')[1]}" + email_domain = email.split("@")[1] + if email_domain in IMAP_MAPPING: + return IMAP_MAPPING[email_domain] + return f"imap.{email_domain}" def search_email_code(imap: imaplib.IMAP4_SSL, count: int, min_t: datetime | None) -> str | None: @@ -39,7 +66,11 @@ async def get_email_code(email: str, password: str, min_t: datetime | None = Non domain = get_imap_domain(email) start_time = time.time() with imaplib.IMAP4_SSL(domain) as imap: - imap.login(email, password) + try: + imap.login(email, password) + except imaplib.IMAP4.error as e: + logger.error(f"Error logging into {email}: {e}") + raise EmailLoginError() from e was_count = 0 while True: @@ -52,5 +83,6 @@ async def get_email_code(email: str, password: str, min_t: datetime | None = Non logger.debug(f"Waiting for confirmation code for {email}, msg_count: {now_count}") if MAX_WAIT_SEC < time.time() - start_time: - raise Exception(f"Timeout on getting confirmation code for {email}") + logger.error(f"Timeout waiting for confirmation code for {email}") + raise EmailCodeTimeoutError() await asyncio.sleep(5) diff --git a/twscrape/login.py b/twscrape/login.py index c9e7212..790d14f 100644 --- a/twscrape/login.py +++ b/twscrape/login.py @@ -176,7 +176,7 @@ async def next_login_task(client: AsyncClient, acc: Account, rep: Response): if task_id == "LoginJsInstrumentationSubtask": return await login_instrumentation(client, acc, 
prev) except Exception as e: - acc.error_msg = f"task={task_id} err={e}" + acc.error_msg = f"login_step={task_id} err={e}" logger.error(f"Error in {task_id}: {e}") raise e diff --git a/twscrape/utils.py b/twscrape/utils.py index 69831b1..f3ff6fe 100644 --- a/twscrape/utils.py +++ b/twscrape/utils.py @@ -148,8 +148,17 @@ def print_table(rows: list[dict]): if not rows: return + def prt(x): + if isinstance(x, str): + return x + + if isinstance(x, int): + return f"{x:,}" + + return str(x) + keys = list(rows[0].keys()) - rows = [{k: k for k in keys}, *[{k: str(x.get(k, "")) for k in keys} for x in rows]] + rows = [{k: k for k in keys}, *[{k: prt(x.get(k, "")) for k in keys} for x in rows]] colw = [max(len(x[k]) for x in rows) + 1 for k in keys] lines = []