Mirror of https://github.com/ssciwr/AMMICO.git, synced 2025-10-30 21:46:04 +02:00

Merge branch 'main' into add_image_summary

Commit f787164572 by Inga Ulusoy
misinformation/test/data/test_data_out.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
,filename,text,text_language,text_english
0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
für Ingenieure und Naturwissenschaftler
Mit zahlreichen Abbildungen und Rechenbeispielen
und einer ausführlichen Integraltafel
3., verbesserte Auflage"
1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
ballenger stor goin
gdĐOL, SIVI 23 TL 02
de in obl
och yd badalang
a
Ber
ook Sy-RW enot go baldus",om,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
balloons big goin
gdĐOL, SIVI 23 TL
there in obl
och yd change
a
Ber
ook Sy-RW isn't going anywhere"
2,./test/data/IMG_3757.jpg,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON",en,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON"
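
Note: the columns in this fixture mirror the keys that TextDetector writes per image (filename, text, text_language, text_english); the garbled OCR strings are intentional test data. As a quick sanity check (illustrative, not part of this commit), the fixture can be loaded with pandas:

    import pandas as pd

    # the first, unnamed column is the numeric row index
    df = pd.read_csv("misinformation/test/data/test_data_out.csv", index_col=0)
    print(df.columns.tolist())  # ['filename', 'text', 'text_language', 'text_english']
    print(len(df))  # 3 test images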
misinformation/test/data/test_data_out_nokey.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
,filename,text,text_language,text_nglish
0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
für Ingenieure und Naturwissenschaftler
Mit zahlreichen Abbildungen und Rechenbeispielen
und einer ausführlichen Integraltafel
3., verbesserte Auflage"
1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
ballenger stor goin
gdĐOL, SIVI 23 TL 02
de in obl
och yd badalang
a
Ber
ook Sy-RW enot go baldus",om,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
balloons big goin
gdĐOL, SIVI 23 TL
there in obl
och yd change
a
Ber
ook Sy-RW isn't going anywhere"
2,./test/data/IMG_3757.jpg,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON",en,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON"
misinformation/test/data/topic_analysis_test.csv (new file, 190 lines)
@@ -0,0 +1,190 @@
text_english
Mercury: Retrograde
Pathology
Symbiote
ProductOfDrugs (Prod. The Virus and Antidote)
Venom
Gatteka
kamikaze (+ pulse)
T.R.U. (Totally Rotten Underground)
I Put My Dick in Your Mental
Andromeda
BRAINFOOD
Troll Under the Bridge
1000 Rounds
Sacrifice
Backpack
D(R)Own
"Okay
TakingOutTheTrash
Io sono qui
Paris
Murder
High 'N Mighty
Euronymous
Hades
Nails
Squeeze
No Teeth
Bang Ya Fucking Head
BLUE JUICE
Loch Ness
Hold Uh
Bone Saw
Coffin Wave
OhNo!
TheArtOfCremation
OakGroveRoad
WhatWasThat
FunnyToSeeYouHere
John Dee
Kybalion
Killer
608
Eternal Dreams
Nightmare Choir (I Been Asleep Too Long)
Exodus
Vengeance
Claustrophobia
Rearranged
Paralax
Exsanguination
Mutiny
Centipede
Грустная сука
This World Is Sick
Пламя
2:45
who is he
Sleeping
Timeless
Pound for Pound
Finger Trembling
Overload
Kill Yourself (Part III)
2nd Hand
Antarctica
Memoirs Of A Gorilla
Runnin' Thru The 7th With My Woadies
Mount Sinai
FUCKTHEPOPULATION
Magazine
2 Hot 4 U (feat. $Uicdeboy$)
O Pana!
LTE
Champion Of Death
Seppuku (feat. Suicideboy$ & Jgrxxn)
You're Now Tuning Into 66.6 FM With DJ Rapture (The Hottest Hour Of The Evening)
Slip On A Banana Clip
A Death In The Ocean Would Be Beautiful
Shattered Amethyst
Goosebumps
Venom
Bury Me
Hack Slash
2000 Rounds
Sea Sick
Grain
"Beware
Kali Yuga
Hexada
Caligula
Niagara (feat. Lil Peep)
Scrying Through Shattered Glass
Polaris
Rapture
Blackmage
Tartarus
Until the Light Takes Us
As Above so Look out Below
Swan
Sneak Diss (feat. So6ix)
Plague Doctor Mask
Some of Us May Never See the World
Filth
Homecoming
Blood
Sweat
Tears
Anabolic
HDMI
Dirt
Oxygen
Branches
CtrlAltDelete
BlastZone (ЗонаПоражения)
CharacterSelect (ВыборПерсонажа)
RestInPeace (Prod. by The Virus And Antidote)
BlackMold
Toxin
Electric
Cranium
Friday
Hooky
Kalaxian Crystals
Slurp
BROKE ft. Prohibeo
Lies
Terry McGinnis
Gremlin
Giant Squit
You Are Not Like Us
Arachnids
Give Ah Fuck
Death Wish
Allergies
Cut Throat
Memoirs of a Gorilla
Benz Truck (гелик)
Norf Norf
Dat $tick
"RAF (feat. A$AP Rocky
Crazy
Still Cold / Pathway Private
The Chills
Slip on a Banana Clip
Lights
Akina Speed Star
Big Fish
The Bodies Fall Just Like the Leaves
Story: No Title
P.S Fuck You Cunt (feat. Lil Peep)
Torch
"Buff Squad (feat. Pouya
Sarcophagus III (feat. $Uicideboy$)
Virginia Tech
Lte
Fuckthepopulation
Gloss of Blood
100K
Dark Light
"But Wait
Great Influence
It Don't Matter
absolute in doubt
Boss
Look at Me Now
Bulletproof
Contraband
Deira City Centre
Kyoto
Pull Out Game
Bird Is The Word
Life Is Short
Here We Go Again
Bloodshed
Wassup Bro!
ACT 2 - BirthOfTheSpaceGod
Grey Tee
Sleeping Bag
Afterlife
King Cobra (Drippin')
Heart Attack
Chain$Aw
"King
P.T.S.D
Brand New
Jukai
Philosopher's Throne
PRBLMS
Back At It
misinformation/test/test_text.py

@@ -2,6 +2,8 @@ import os
 import pytest
 import spacy
 import misinformation.text as tt
+import misinformation
+import pandas as pd
 
 TESTDICT = {
     "IMG_3755": {
@@ -29,7 +31,6 @@ def test_TextDetector():
         assert test_obj.subdict["text_language"] is None
         assert test_obj.subdict["text_english"] is None
         assert not test_obj.analyse_text
-        assert not test_obj.analyse_topic
 
 
 @pytest.mark.gcv
@@ -39,7 +40,6 @@ def test_analyse_image():
         test_obj.analyse_image()
         test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
         test_obj.analyse_image()
-        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -68,6 +68,15 @@ def test_translate_text():
         assert test_obj.subdict["text_english"] == translated_text
 
 
+def test_remove_linebreaks():
+    test_obj = tt.TextDetector({})
+    test_obj.subdict["text"] = "This is \n a test."
+    test_obj.subdict["text_english"] = "This is \n another\n test."
+    test_obj.remove_linebreaks()
+    assert test_obj.subdict["text"] == "This is   a test."
+    assert test_obj.subdict["text_english"] == "This is   another  test."
+
+
 def test_run_spacy():
     test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
@@ -106,3 +115,34 @@ def test_sentiment_analysis():
     test_obj.sentiment_analysis()
     assert test_obj.subdict["polarity"] == 0.5
     assert test_obj.subdict["subjectivity"] == 0.6
+
+
+def test_PostprocessText():
+    reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
+    reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
+    obj = tt.PostprocessText(mydict=TESTDICT)
+    # make sure test works on windows where end-of-line character is \r\n
+    test_dict = obj.list_text_english[2].replace("\r", "")
+    assert test_dict == reference_dict
+    for key in TESTDICT.keys():
+        TESTDICT[key].pop("text_english")
+    with pytest.raises(ValueError):
+        tt.PostprocessText(mydict=TESTDICT)
+    obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv")
+    # make sure test works on windows where end-of-line character is \r\n
+    test_df = obj.list_text_english[0].replace("\r", "")
+    assert test_df == reference_df
+    with pytest.raises(ValueError):
+        tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv")
+    with pytest.raises(ValueError):
+        tt.PostprocessText()
+
+
+def test_analyse_topic():
+    _, topic_df, most_frequent_topics = tt.PostprocessText(
+        use_csv=True, csv_path="./test/data/topic_analysis_test.csv"
+    ).analyse_topic()
+    # since this is not deterministic we cannot be sure we get the same result twice
+    assert len(topic_df) == 2
+    assert topic_df["Name"].iloc[0] == "0_the_feat_of_is"
+    assert most_frequent_topics[0][0][0] == "the"
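
Note on the non-determinism remark in test_analyse_topic: BERTopic's output varies between runs mainly because of its UMAP dimensionality-reduction step. A minimal sketch of one way to make the topics reproducible, assuming the umap-learn package that BERTopic already depends on (the parameter values shown are UMAP's BERTopic defaults plus a fixed seed; this is not part of the commit):

    from bertopic import BERTopic
    from umap import UMAP

    # fixing the UMAP seed removes the main source of run-to-run variation
    umap_model = UMAP(
        n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
    )
    topic_model = BERTopic(umap_model=umap_model)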
misinformation/text.py

@@ -6,6 +6,9 @@ from textblob import TextBlob
 from textblob import download_corpora
 import io
 from misinformation import utils
+import grpc
+import pandas as pd
+from bertopic import BERTopic
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -13,14 +16,11 @@ from misinformation import utils
 
 
 class TextDetector(utils.AnalysisMethod):
-    def __init__(
-        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
-    ) -> None:
+    def __init__(self, subdict: dict, analyse_text: bool = False) -> None:
         super().__init__(subdict)
         self.subdict.update(self.set_keys())
         self.translator = Translator()
         self.analyse_text = analyse_text
-        self.analyse_topic = analyse_topic
         if self.analyse_text:
             self._initialize_spacy()
             self._initialize_textblob()
@@ -46,13 +46,12 @@ class TextDetector(utils.AnalysisMethod):
     def analyse_image(self):
         self.get_text_from_image()
         self.translate_text()
+        self.remove_linebreaks()
         if self.analyse_text:
             self._run_spacy()
             self.clean_text()
             self.correct_spelling()
             self.sentiment_analysis()
-        if self.analyse_topic:
-            self.analyse_topic()
         return self.subdict
 
     def get_text_from_image(self):
@@ -62,12 +61,20 @@ class TextDetector(utils.AnalysisMethod):
         with io.open(path, "rb") as image_file:
             content = image_file.read()
         image = vision.Image(content=content)
-        response = client.text_detection(image=image)
-        texts = response.text_annotations[0].description
-        # here check if text was found
-        if texts:
+        # check for usual connection errors and skip the image if the call fails
+        try:
+            response = client.text_detection(image=image)
+        except grpc.RpcError as exc:
+            print("Cloud vision API connection failed")
+            print("Skipping this image ..{}".format(path))
+            print("Connection failed with code {}: {}".format(exc.code(), exc))
+            return
+        # here check if text was found on image
+        if response.text_annotations:
+            texts = response.text_annotations[0].description
             self.subdict["text"] = texts
         if response.error.message:
             print("Google Cloud Vision Error")
             raise ValueError(
                 "{}\nFor more info on error messages, check: "
                 "https://cloud.google.com/apis/design/errors".format(
@@ -80,6 +87,14 @@ class TextDetector(utils.AnalysisMethod):
         self.subdict["text_language"] = translated.src
         self.subdict["text_english"] = translated.text
 
+    def remove_linebreaks(self):
+        """Remove linebreaks from original and translated text."""
+        if self.subdict["text"]:
+            self.subdict["text"] = self.subdict["text"].replace("\n", " ")
+            self.subdict["text_english"] = self.subdict["text_english"].replace(
+                "\n", " "
+            )
+
     def _run_spacy(self):
         """Generate spacy doc object."""
         self.doc = self.nlp(self.subdict["text_english"])
@@ -105,5 +120,73 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
-    def analyse_topic(self):
-        pass
+
+class PostprocessText:
+    def __init__(
+        self, mydict: dict = None, use_csv: bool = False, csv_path: str = None
+    ) -> None:
+        self.use_csv = use_csv
+        if mydict:
+            print("Reading data from dict.")
+            self.mydict = mydict
+            self.list_text_english = self.get_text_dict()
+        elif self.use_csv:
+            print("Reading data from df.")
+            self.df = pd.read_csv(csv_path, encoding="utf8")
+            self.list_text_english = self.get_text_df()
+        else:
+            raise ValueError(
+                "Please provide either a dictionary with textual data or "
+                "a csv file by setting `use_csv` to True and providing a "
+                "`csv_path`."
+            )
+
+    def analyse_topic(self, return_topics: int = 3):
+        """Topic analysis using BERTopic."""
+        # load spacy pipeline
+        nlp = spacy.load(
+            "en_core_web_md",
+            exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
+        )
+        try:
+            # unfortunately catching exceptions does not work here - need to figure out why
+            self.topic_model = BERTopic(embedding_model=nlp)
+        except TypeError:
+            print("BERTopic exited with an error - maybe your dataset is too small?")
+        self.topics, self.probs = self.topic_model.fit_transform(self.list_text_english)
+        # return the topic list
+        topic_df = self.topic_model.get_topic_info()
+        # return the most frequent return_topics
+        most_frequent_topics = []
+        if len(topic_df) < return_topics:
+            print("You requested more topics than are identified in your dataset -")
+            print(
+                "Returning only {} topics as these are all that have been found.".format(
+                    len(topic_df)
+                )
+            )
+        for i in range(min(return_topics, len(topic_df))):
+            most_frequent_topics.append(self.topic_model.get_topic(i))
+        return self.topic_model, topic_df, most_frequent_topics
+
+    def get_text_dict(self):
+        # use dict to put text_english in list
+        list_text_english = []
+        for key in self.mydict.keys():
+            if "text_english" not in self.mydict[key]:
+                raise ValueError(
+                    "Please check your provided dictionary - "
+                    "no english text data found."
+                )
+            list_text_english.append(self.mydict[key]["text_english"])
+        return list_text_english
+
+    def get_text_df(self):
+        # use csv file to obtain dataframe and put text_english in list
+        # check that "text_english" is there
+        if "text_english" not in self.df:
+            raise ValueError(
+                "Please check your provided dataframe - "
+                "no english text data found."
+            )
+        return self.df["text_english"].tolist()
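
For orientation, a minimal usage sketch of the new PostprocessText class (the csv path is illustrative; any csv with a text_english column, such as the data_out.csv written by the notebook below, should work):

    import misinformation.text as tt

    # read previously detected and translated text from a csv
    pp = tt.PostprocessText(use_csv=True, csv_path="data_out.csv")
    # fit BERTopic and return the model, the topic overview table and
    # the most frequent topics
    topic_model, topic_df, most_frequent_topics = pp.analyse_topic(return_topics=3)
    print(topic_df)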
notebooks/get-text-from-image.ipynb (generated, 157 lines changed)
@@ -42,7 +42,18 @@
     "import os\n",
     "from IPython.display import Image, display\n",
     "import misinformation\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "print(tf.config.list_physical_devices(\"GPU\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27675810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# download the models if they are not there yet\n",
     "!python -m spacy download en_core_web_md\n",
     "!python -m textblob.download_corpora"
@@ -55,9 +66,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(\n",
-    "    path=\"drive/MyDrive/misinformation-data/\", limit=1000\n",
-    ")"
+    "images = misinformation.find_files(path=\"../data/all/\", limit=1000)"
    ]
   },
   {
@@ -78,7 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mydict = misinformation.utils.initialize_dict(images[0:10])"
+    "mydict = misinformation.utils.initialize_dict(images[0:3])"
    ]
   },
   {
@@ -99,7 +108,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -180,13 +189,143 @@
    "outputs": [],
    "source": [
     "# Write the csv\n",
-    "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")"
+    "df.to_csv(\"./data_out.csv\")"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "4bc8ac0a",
+   "metadata": {},
+   "source": [
+    "# Topic analysis\n",
+    "The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html) with an embedding model provided through a [spaCy](https://spacy.io/) pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4931941b",
+   "metadata": {},
+   "source": [
+    "BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the below returns an error for `analyse_topic()`, the reason may be that your dataset is too small.\n",
+    "### Option 1: Use the dictionary as obtained from the above analysis."
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
-   "id": "568537df",
+   "id": "a3450a61",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# make a list of all the text_english entries per analysed image from the mydict variable as above\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    mydict=mydict\n",
+    ").analyse_topic()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95667342",
+   "metadata": {},
+   "source": [
+    "### Option 2: Read in a csv\n",
+    "To avoid analysing too many images on Google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5530e436",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file_path = \"data_out.csv\"\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    use_csv=True, csv_path=input_file_path\n",
+    ").analyse_topic(return_topics=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b6ef6d7",
+   "metadata": {},
+   "source": [
+    "### Access frequent topics\n",
+    "A topic of `-1` stands for an outlier and should be ignored. The topic count is the number of occurrences of that topic. The output is ordered from most frequent to least frequent topic."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(topic_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3316770",
+   "metadata": {},
+   "source": [
+    "### Get information for specific topic\n",
+    "The most frequent topics can be accessed through `most_frequent_topics`, with the most frequent topics first in the list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db14fe03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for topic in most_frequent_topics:\n",
+    "    print(\"Topic:\", topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d10f701e",
+   "metadata": {},
+   "source": [
+    "### Topic visualization\n",
+    "The topics can also be visualized. Careful: this only works if there is sufficient data (quantity and quality)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2331afe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.visualize_topics()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4eaf353",
+   "metadata": {},
+   "source": [
+    "### Save the model\n",
+    "The model can be saved for future use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e8377c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.save(\"misinfo_posts\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c94edb9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
@@ -194,7 +333,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "misinf",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -208,7 +347,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
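
The notebook ends by persisting the fitted model via topic_model.save("misinfo_posts"). A short sketch of restoring it in a later session (standard BERTopic API; the path is the one used in the notebook):

    from bertopic import BERTopic

    # restore the saved model for further inspection
    topic_model = BERTopic.load("misinfo_posts")
    print(topic_model.get_topic_info())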
pyproject.toml

@@ -48,6 +48,9 @@ dependencies = [
     "textblob",
     "torch",
-    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main"
+    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main",
+    "bertopic",
+    "grpcio",
 ]
 
 [project.scripts]
requirements.txt

@@ -24,3 +24,5 @@ spacytextblob
 textblob
 git+https://github.com/sloria/TextBlob.git@dev
 git+https://github.com/salesforce/LAVIS.git@main
+bertopic
+grpcio