diff --git a/misinformation/test/data/test_data_out.csv b/misinformation/test/data/test_data_out.csv new file mode 100644 index 0000000..ea08364 --- /dev/null +++ b/misinformation/test/data/test_data_out.csv @@ -0,0 +1,52 @@ +,filename,text,text_language,text_english +0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung +für Ingenieure und Naturwissenschaftler +Mit zahlreichen Abbildungen und Rechenbeispielen +und einer ausführlichen Integraltafel +3., verbesserte Auflage" +1,./test/data/IMG_3756.jpg,"SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +ballenger stor goin +gdĐOL, SIVI 23 TL 02 +de in obl +och yd badalang +a +Ber +ook Sy-RW enot go baldus",om,"SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +balloons big goin +gdĐOL, SIVI 23 TL +there in obl +och yd change +a +Ber +ook Sy-RW isn't going anywhere" +2,./test/data/IMG_3757.jpg,"THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON",en,"THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON" diff --git a/misinformation/test/data/test_data_out_nokey.csv b/misinformation/test/data/test_data_out_nokey.csv new file mode 100644 index 0000000..1fc1d11 --- /dev/null +++ b/misinformation/test/data/test_data_out_nokey.csv @@ -0,0 +1,52 @@ +,filename,text,text_language,text_nglish +0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung +für Ingenieure und Naturwissenschaftler +Mit zahlreichen Abbildungen und Rechenbeispielen +und einer ausführlichen Integraltafel +3., verbesserte Auflage" +1,./test/data/IMG_3756.jpg,"SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. 
TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +ballenger stor goin +gdĐOL, SIVI 23 TL 02 +de in obl +och yd badalang +a +Ber +ook Sy-RW enot go baldus",om,"SCATTERING THEORY +The Quantum Theory of +Nonrelativistic Collisions +JOHN R. TAYLOR +University of Colorado +ostaliga Lanbidean +1 ilde +balloons big goin +gdĐOL, SIVI 23 TL +there in obl +och yd change +a +Ber +ook Sy-RW isn't going anywhere" +2,./test/data/IMG_3757.jpg,"THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON",en,"THE +ALGEBRAIC +EIGENVALUE +PROBLEM +DOM +NVS TIO +MINA +Monographs +on Numerical Analysis +J.. H. WILKINSON" diff --git a/misinformation/test/data/topic_analysis_test.csv b/misinformation/test/data/topic_analysis_test.csv new file mode 100644 index 0000000..ba727f1 --- /dev/null +++ b/misinformation/test/data/topic_analysis_test.csv @@ -0,0 +1,190 @@ +text_english +Mercury: Retrograde +Pathology +Symbiote +ProductOfDrugs (Prod. The Virus and Antidote) +Venom +Gatteka +kamikaze (+ pulse) +T.R.U. (Totally Rotten Underground) +I Put My Dick in Your Mental +Andromeda +BRAINFOOD +Troll Under the Bridge +1000 Rounds +Sacrifice +Backpack +D(R)Own +"Okay +TakingOutTheTrash +Io sono qui +Paris +Murder +High 'N Mighty +Euronymous +Hades +Nails +Squeeze +No Teeth +Bang Ya Fucking Head +BLUE JUICE +Loch Ness +Hold Uh +Bone Saw +Coffin Wave +OhNo! +TheArtOfCremation +OakGroveRoad +WhatWasThat +FunnyToSeeYouHere +John Dee +Kybalion +Killer +608 +Eternal Dreams +Nightmare Choir (I Been Asleep Too Long) +Exodus +Vengeance +Claustrophobia +Rearranged +Paralax +Exsanguination +Mutiny +Centipede +Грустная сука +This World Is Sick +Пламя +2:45 +who is he +Sleeping +Timeless +Pound for Pound +Finger Trembling +Overload +Kill Yourself (Part III) +2nd Hand +Antarctica +Memoirs Of A Gorilla +Runnin' Thru The 7th With My Woadies +Mount Sinai +FUCKTHEPOPULATION +Magazine +2 Hot 4 U (feat. $Uicdeboy$) +O Pana! 
+LTE +Champion Of Death +Seppuku (feat. Suicideboy$ & Jgrxxn) +You're Now Tuning Into 66.6 FM With DJ Rapture (The Hottest Hour Of The Evening) +Slip On A Banana Clip +A Death In The Ocean Would Be Beautiful +Shattered Amethyst +Goosebumps +Venom +Bury Me +Hack Slash +2000 Rounds +Sea Sick +Grain +"Beware +Kali Yuga +Hexada +Caligula +Niagara (feat. Lil Peep) +Scrying Through Shattered Glass +Polaris +Rapture +Blackmage +Tartarus +Until the Light Takes Us +As Above so Look out Below +Swan +Sneak Diss (feat. So6ix) +Plague Doctor Mask +Some of Us May Never See the World +Filth +Homecoming +Blood +Sweat +Tears +Anabolic +HDMI +Dirt +Oxygen +Branches +CtrlAltDelete +BlastZone (ЗонаПоражения) +CharacterSelect (ВыборПерсонажа) +RestInPeace (Prod. by The Virus And Antidote) +BlackMold +Toxin +Electric +Cranium +Friday +Hooky +Kalaxian Crystals +Slurp +BROKE ft. Prohibeo +Lies +Terry McGinnis +Gremlin +Giant Squit +You Are Not Like Us +Arachnids +Give Ah Fuck +Death Wish +Allergies +Cut Throat +Memoirs of a Gorilla +Benz Truck (гелик) +Norf Norf +Dat $tick +"RAF (feat. A$AP Rocky +Crazy +Still Cold / Pathway Private +The Chills +Slip on a Banana Clip +Lights +Akina Speed Star +Big Fish +The Bodies Fall Just Like the Leaves +Story: No Title +P.S Fuck You Cunt (feat. Lil Peep) +Torch +"Buff Squad (feat. Pouya +Sarcophagus III (feat. $Uicideboy$) +Virginia Tech +Lte +Fuckthepopulation +Gloss of Blood +100K +Dark Light +"But Wait +Great Influence +It Don't Matter +absolute in doubt +Boss +Look at Me Now +Bulletproof +Contraband +Deira City Centre +Kyoto +Pull Out Game +Bird Is The Word +Life Is Short +Here We Go Again +Bloodshed +Wassup Bro! 
+ACT 2 - BirthOfTheSpaceGod +Grey Tee +Sleeping Bag +Afterlife +King Cobra (Drippin') +Heart Attack +Chain$Aw +"King +P.T.S.D +Brand New +Jukai +Philosopher's Throne +PRBLMS +Back At It diff --git a/misinformation/test/test_text.py b/misinformation/test/test_text.py index d8cdd16..9e71349 100644 --- a/misinformation/test/test_text.py +++ b/misinformation/test/test_text.py @@ -2,6 +2,8 @@ import os import pytest import spacy import misinformation.text as tt +import misinformation +import pandas as pd TESTDICT = { "IMG_3755": { @@ -29,7 +31,6 @@ def test_TextDetector(): assert test_obj.subdict["text_language"] is None assert test_obj.subdict["text_english"] is None assert not test_obj.analyse_text - assert not test_obj.analyse_topic @pytest.mark.gcv @@ -39,7 +40,6 @@ def test_analyse_image(): test_obj.analyse_image() test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True) test_obj.analyse_image() - test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True) @pytest.mark.gcv @@ -68,6 +68,15 @@ def test_translate_text(): assert test_obj.subdict["text_english"] == translated_text +def test_remove_linebreaks(): + test_obj = tt.TextDetector({}) + test_obj.subdict["text"] = "This is \n a test." + test_obj.subdict["text_english"] = "This is \n another\n test." + test_obj.remove_linebreaks() + assert test_obj.subdict["text"] == "This is a test." + assert test_obj.subdict["text_english"] == "This is another test." + + def test_run_spacy(): test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True) ref_file = "./test/data/text_IMG_3755.txt" @@ -106,3 +115,34 @@ def test_sentiment_analysis(): test_obj.sentiment_analysis() assert test_obj.subdict["polarity"] == 0.5 assert test_obj.subdict["subjectivity"] == 0.6 + + +def test_PostprocessText(): + reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. 
WILKINSON" + reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage" + obj = tt.PostprocessText(mydict=TESTDICT) + # make sure test works on windows where end-of-line character is \r\n + test_dict = obj.list_text_english[2].replace("\r", "") + assert test_dict == reference_dict + for key in TESTDICT.keys(): + TESTDICT[key].pop("text_english") + with pytest.raises(ValueError): + tt.PostprocessText(mydict=TESTDICT) + obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv") + # make sure test works on windows where end-of-line character is \r\n + test_df = obj.list_text_english[0].replace("\r", "") + assert test_df == reference_df + with pytest.raises(ValueError): + tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv") + with pytest.raises(ValueError): + tt.PostprocessText() + + +def test_analyse_topic(): + _, topic_df, most_frequent_topics = tt.PostprocessText( + use_csv=True, csv_path="./test/data/topic_analysis_test.csv" + ).analyse_topic() + # since this is not deterministic we cannot be sure we get the same result twice + assert len(topic_df) == 2 + assert topic_df["Name"].iloc[0] == "0_the_feat_of_is" + assert most_frequent_topics[0][0][0] == "the" diff --git a/misinformation/text.py b/misinformation/text.py index 2af06af..9466f30 100644 --- a/misinformation/text.py +++ b/misinformation/text.py @@ -6,6 +6,9 @@ from textblob import TextBlob from textblob import download_corpora import io from misinformation import utils +import grpc +import pandas as pd +from bertopic import BERTopic # make widgets work again # clean text has weird spaces and separation of "do n't" @@ -13,14 +16,11 @@ from misinformation import utils class TextDetector(utils.AnalysisMethod): - def __init__( - self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False - ) -> None: + def 
__init__(self, subdict: dict, analyse_text: bool = False) -> None: super().__init__(subdict) self.subdict.update(self.set_keys()) self.translator = Translator() self.analyse_text = analyse_text - self.analyse_topic = analyse_topic if self.analyse_text: self._initialize_spacy() self._initialize_textblob() @@ -46,13 +46,12 @@ class TextDetector(utils.AnalysisMethod): def analyse_image(self): self.get_text_from_image() self.translate_text() + self.remove_linebreaks() if self.analyse_text: self._run_spacy() self.clean_text() self.correct_spelling() self.sentiment_analysis() - if self.analyse_topic: - self.analyse_topic() return self.subdict def get_text_from_image(self): @@ -62,12 +61,19 @@ class TextDetector(utils.AnalysisMethod): with io.open(path, "rb") as image_file: content = image_file.read() image = vision.Image(content=content) - response = client.text_detection(image=image) - texts = response.text_annotations[0].description - # here check if text was found - if texts: + # check for usual connection errors and retry if necessary + try: + response = client.text_detection(image=image) + except grpc.RpcError as exc: + print("Cloud vision API connection failed") + print("Skipping this image ..{}".format(path)) + print("Connection failed with code {}: {}".format(exc.code(), exc)) + # here check if text was found on image + if response: + texts = response.text_annotations[0].description self.subdict["text"] = texts if response.error.message: + print("Google Cloud Vision Error") raise ValueError( "{}\nFor more info on error messages, check: " "https://cloud.google.com/apis/design/errors".format( @@ -80,6 +86,14 @@ class TextDetector(utils.AnalysisMethod): self.subdict["text_language"] = translated.src self.subdict["text_english"] = translated.text + def remove_linebreaks(self): + """Remove linebreaks from original and translated text.""" + if self.subdict["text"]: + self.subdict["text"] = self.subdict["text"].replace("\n", " ") + self.subdict["text_english"] = 
self.subdict["text_english"].replace( + "\n", " " + ) + def _run_spacy(self): """Generate spacy doc object.""" self.doc = self.nlp(self.subdict["text_english"]) @@ -105,5 +119,73 @@ class TextDetector(utils.AnalysisMethod): # where 0.0 is very objective and 1.0 is very subjective self.subdict["subjectivity"] = self.doc._.blob.subjectivity - def analyse_topic(self): - pass + +class PostprocessText: + def __init__( + self, mydict: dict = None, use_csv: bool = False, csv_path: str = None + ) -> None: + self.use_csv = use_csv + if mydict: + print("Reading data from dict.") + self.mydict = mydict + self.list_text_english = self.get_text_dict() + elif self.use_csv: + print("Reading data from df.") + self.df = pd.read_csv(csv_path, encoding="utf8") + self.list_text_english = self.get_text_df() + else: + raise ValueError( + "Please provide either dictionary with textual data or \ + a csv file by setting `use_csv` to True and providing a \ + `csv_path`." + ) + + def analyse_topic(self, return_topics: int = 3): + """Topic analysis using BERTopic.""" + # load spacy pipeline + nlp = spacy.load( + "en_core_web_md", + exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"], + ) + try: + # unfortunately catching exceptions does not work here - need to figure out why + self.topic_model = BERTopic(embedding_model=nlp) + except TypeError: + print("BERTopic excited with an error - maybe your dataset is too small?") + self.topics, self.probs = self.topic_model.fit_transform(self.list_text_english) + # return the topic list + topic_df = self.topic_model.get_topic_info() + # return the most frequent return_topics + most_frequent_topics = [] + if len(topic_df) < return_topics: + print("You requested more topics than are identified in your dataset -") + print( + "Returning only {} topics as these are all that have been found.".format( + len(topic_df) + ) + ) + for i in range(min(return_topics, len(topic_df))): + most_frequent_topics.append(self.topic_model.get_topic(i)) + 
return self.topic_model, topic_df, most_frequent_topics + + def get_text_dict(self): + # use dict to put text_english in list + list_text_english = [] + for key in self.mydict.keys(): + if "text_english" not in self.mydict[key]: + raise ValueError( + "Please check your provided dictionary - \ + no english text data found." + ) + list_text_english.append(self.mydict[key]["text_english"]) + return list_text_english + + def get_text_df(self): + # use csv file to obtain dataframe and put text_english in list + # check that "text_english" is there + if "text_english" not in self.df: + raise ValueError( + "Please check your provided dataframe - \ + no english text data found." + ) + return self.df["text_english"].tolist() diff --git a/notebooks/get-text-from-image.ipynb b/notebooks/get-text-from-image.ipynb index 32e8922..7666a88 100644 --- a/notebooks/get-text-from-image.ipynb +++ b/notebooks/get-text-from-image.ipynb @@ -42,7 +42,18 @@ "import os\n", "from IPython.display import Image, display\n", "import misinformation\n", + "import tensorflow as tf\n", "\n", + "print(tf.config.list_physical_devices(\"GPU\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27675810", + "metadata": {}, + "outputs": [], + "source": [ "# download the models if they are not there yet\n", "!python -m spacy download en_core_web_md\n", "!python -m textblob.download_corpora" @@ -55,9 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "images = misinformation.find_files(\n", - " path=\"drive/MyDrive/misinformation-data/\", limit=1000\n", - ")" + "images = misinformation.find_files(path=\"../data/all/\", limit=1000)" ] }, { @@ -78,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "mydict = misinformation.utils.initialize_dict(images[0:10])" + "mydict = misinformation.utils.initialize_dict(images[0:3])" ] }, { @@ -99,7 +108,7 @@ "source": [ "os.environ[\n", " \"GOOGLE_APPLICATION_CREDENTIALS\"\n", - "] = 
\"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\"" + "] = \"../data/misinformation-campaign-981aa55a3b13.json\"" ] }, { @@ -180,13 +189,143 @@ "outputs": [], "source": [ "# Write the csv\n", - "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")" + "df.to_csv(\"./data_out.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "4bc8ac0a", + "metadata": {}, + "source": [ + "# Topic analysis\n", + "The topic analysis is carried out with [BERTopic](https://maartengr.github.io/BERTopic/index.html), using an embedding model provided through a [spaCy](https://spacy.io/) pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "4931941b", + "metadata": {}, + "source": [ + "BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the call to `analyse_topic()` below raises an error, the reason may be that your dataset is too small.\n", + "### Option 1: Use the dictionary as obtained from the above analysis." ] }, { "cell_type": "code", "execution_count": null, - "id": "568537df", + "id": "a3450a61", + "metadata": {}, + "outputs": [], + "source": [ + "# make a list of all the text_english entries per analysed image from the mydict variable as above\n", + "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n", + " mydict=mydict\n", + ").analyse_topic()" + ] + }, + { + "cell_type": "markdown", + "id": "95667342", + "metadata": {}, + "source": [ + "### Option 2: Read in a csv\n", + "To avoid sending too many images to Google Cloud Vision, read the text from the csv output instead (useful when rerunning already analysed images)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5530e436", + "metadata": {}, + "outputs": [], + "source": [ + "input_file_path = \"data_out.csv\"\n", + "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n", + " use_csv=True, csv_path=input_file_path\n", + ").analyse_topic(return_topics=10)" + ] + }, + { + "cell_type": "markdown", + "id": "0b6ef6d7", + "metadata": {}, + "source": [ + "### Access frequent topics\n", + "A topic of `-1` stands for an outlier and should be ignored. Topic count is the number of occurrences of that topic. The output is structured from most frequent to least frequent topic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f", + "metadata": {}, + "outputs": [], + "source": [ + "print(topic_df)" + ] + }, + { + "cell_type": "markdown", + "id": "b3316770", + "metadata": {}, + "source": [ + "### Get information for specific topic\n", + "The most frequent topics can be accessed through `most_frequent_topics` with the most frequently occurring topics first in the list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db14fe03", + "metadata": {}, + "outputs": [], + "source": [ + "for topic in most_frequent_topics:\n", + " print(\"Topic:\", topic)" + ] + }, + { + "cell_type": "markdown", + "id": "d10f701e", + "metadata": {}, + "source": [ + "### Topic visualization\n", + "The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2331afe6", + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.visualize_topics()" + ] + }, + { + "cell_type": "markdown", + "id": "f4eaf353", + "metadata": {}, + "source": [ + "### Save the model\n", + "The model can be saved for future use."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5e8377c", + "metadata": {}, + "outputs": [], + "source": [ + "topic_model.save(\"misinfo_posts\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c94edb9", "metadata": {}, "outputs": [], "source": [] @@ -194,7 +333,7 @@ ], "metadata": { "kernelspec": { - "display_name": "misinf", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -208,7 +347,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]" + "version": "3.10.6" }, "vscode": { "interpreter": { diff --git a/pyproject.toml b/pyproject.toml index b584314..9a3fc1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,9 @@ dependencies = [ "jupyterlab", "spacytextblob", "textblob", + "bertopic", + "grpcio", + "pandas", ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 702bca7..7abae00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,7 @@ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1 jupyterlab spacytextblob textblob -git+https://github.com/sloria/TextBlob.git@dev \ No newline at end of file +git+https://github.com/sloria/TextBlob.git@dev +bertopic +grpcio +pandas \ No newline at end of file