From aa69c653ef7eda9a75bfd280089ecffcb0a871c1 Mon Sep 17 00:00:00 2001 From: Vlad Pronsky Date: Sun, 30 Jul 2023 17:24:07 +0300 Subject: [PATCH] restore full tweets text in case of retweet #42 --- examples/parallel_search_with_limit.py | 4 ++-- tests/test_parser.py | 7 ++++--- twscrape/models.py | 26 +++++++++++++++----------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/parallel_search_with_limit.py b/examples/parallel_search_with_limit.py index cbddefb..64fefc7 100644 --- a/examples/parallel_search_with_limit.py +++ b/examples/parallel_search_with_limit.py @@ -24,8 +24,8 @@ async def worker(queue: asyncio.Queue, api: twscrape.API): async def main(): api = twscrape.API() # add accounts here or before from cli (see README.md for examples) - # await api.pool.add_account("u1", "p1", "eu1", "ep1") - # await api.pool.login_all() + await api.pool.add_account("u1", "p1", "eu1", "ep1") + await api.pool.login_all() queries = ["elon musk", "tesla", "spacex", "neuralink", "boring company"] diff --git a/tests/test_parser.py b/tests/test_parser.py index 9ec6303..74225ce 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -99,6 +99,9 @@ def check_tweet(doc: Tweet | None): assert v.bitrate is not None assert v.contentType is not None + if doc.retweetedTweet is not None: + assert doc.rawContent.endswith(doc.retweetedTweet.rawContent), "content should be full" + check_user(doc.user) @@ -306,6 +309,4 @@ async def test_issue_42(): assert doc.rawContent is not None assert doc.retweetedTweet.rawContent is not None - msg = "Today marks the arrival of a traditional Chinese solar term called mangzhong, or Grain in Ear, signifying a busy farming period. https://t.co/SQMrX99bWr" - assert doc.retweetedTweet.rawContent == msg - # assert doc.rawContent == msg # todo: not sure should it be populated from rt + assert doc.rawContent.endswith(doc.retweetedTweet.rawContent) diff --git a/twscrape/models.py b/twscrape/models.py index 0ceaafe..239f6af 100644 --- a/twscrape/models.py +++ b/twscrape/models.py @@ -195,14 +195,7 @@ class Tweet(JSONTrait): qt_id = _first(obj, ["quoted_status_id_str", "quoted_status_result.result.rest_id"]) qt_obj = get_or(res, f"tweets.{qt_id}") - # for development - # print() - # print("-" * 80) - # print(res["tweets"].keys()) - # print(rt_id, rt_obj is not None) - # print(qt_id, qt_obj is not None) - - return Tweet( + doc = Tweet( id=int(obj["id_str"]), id_str=obj["id_str"], url=f'https://twitter.com/{tw_usr.username}/status/{obj["id_str"]}', @@ -232,6 +225,19 @@ class Tweet(JSONTrait): media=Media.parse(obj), ) + # issue #42 – restore full rt text + rt = doc.retweetedTweet + if rt is not None and rt.user is not None and doc.rawContent.endswith("…"): + prefix = f"RT @{rt.user.username}: " + rt_msg = f"{prefix}{rt.rawContent}" + if doc.rawContent != rt_msg and doc.rawContent.startswith(prefix): + # was = doc.rawContent.replace("\n", "") + # now = rt_msg.replace("\n", "") + # print(f"\n{was}\n{now}\n") + doc.rawContent = rt_msg + + return doc + @dataclass class MediaPhoto(JSONTrait): @@ -239,9 +245,7 @@ class MediaPhoto(JSONTrait): @staticmethod def parse(obj: dict): - return MediaPhoto( - url=obj["media_url_https"], - ) + return MediaPhoto(url=obj["media_url_https"]) @dataclass