Mirror of https://github.com/ssciwr/AMMICO.git, synced 2025-10-30 21:46:04 +02:00

Merge branch 'main' into add_image_summary

Commit f787164572 by Inga Ulusoy
misinformation/test/data/test_data_out.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
,filename,text,text_language,text_english
0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
für Ingenieure und Naturwissenschaftler
Mit zahlreichen Abbildungen und Rechenbeispielen
und einer ausführlichen Integraltafel
3., verbesserte Auflage"
1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
ballenger stor goin
gdĐOL, SIVI 23 TL 02
de in obl
och yd badalang
a
Ber
ook Sy-RW enot go baldus",om,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
balloons big goin
gdĐOL, SIVI 23 TL
there in obl
och yd change
a
Ber
ook Sy-RW isn't going anywhere"
2,./test/data/IMG_3757.jpg,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON",en,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON"
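
Note: the columns in this fixture mirror the keys that TextDetector writes per image (filename, text, text_language, text_english); the garbled OCR strings are intentional test data. As a quick sanity check (illustrative, not part of this commit), the fixture can be loaded with pandas:

    import pandas as pd

    # the first, unnamed column is the numeric row index
    df = pd.read_csv("misinformation/test/data/test_data_out.csv", index_col=0)
    print(df.columns.tolist())  # ['filename', 'text', 'text_language', 'text_english']
    print(len(df))  # 3 test images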
misinformation/test/data/test_data_out_nokey.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
,filename,text,text_language,text_nglish
0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
für Ingenieure und Naturwissenschaftler
Mit zahlreichen Abbildungen und Rechenbeispielen
und einer ausführlichen Integraltafel
3., verbesserte Auflage"
1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
ballenger stor goin
gdĐOL, SIVI 23 TL 02
de in obl
och yd badalang
a
Ber
ook Sy-RW enot go baldus",om,"SCATTERING THEORY
The Quantum Theory of
Nonrelativistic Collisions
JOHN R. TAYLOR
University of Colorado
ostaliga Lanbidean
1 ilde
balloons big goin
gdĐOL, SIVI 23 TL
there in obl
och yd change
a
Ber
ook Sy-RW isn't going anywhere"
2,./test/data/IMG_3757.jpg,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON",en,"THE
ALGEBRAIC
EIGENVALUE
PROBLEM
DOM
NVS TIO
MINA
Monographs
on Numerical Analysis
J.. H. WILKINSON"
misinformation/test/data/topic_analysis_test.csv (new file, 190 lines)
@@ -0,0 +1,190 @@
text_english
Mercury: Retrograde
Pathology
Symbiote
ProductOfDrugs (Prod. The Virus and Antidote)
Venom
Gatteka
kamikaze (+ pulse)
T.R.U. (Totally Rotten Underground)
I Put My Dick in Your Mental
Andromeda
BRAINFOOD
Troll Under the Bridge
1000 Rounds
Sacrifice
Backpack
D(R)Own
"Okay
TakingOutTheTrash
Io sono qui
Paris
Murder
High 'N Mighty
Euronymous
Hades
Nails
Squeeze
No Teeth
Bang Ya Fucking Head
BLUE JUICE
Loch Ness
Hold Uh
Bone Saw
Coffin Wave
OhNo!
TheArtOfCremation
OakGroveRoad
WhatWasThat
FunnyToSeeYouHere
John Dee
Kybalion
Killer
608
Eternal Dreams
Nightmare Choir (I Been Asleep Too Long)
Exodus
Vengeance
Claustrophobia
Rearranged
Paralax
Exsanguination
Mutiny
Centipede
Грустная сука
This World Is Sick
Пламя
2:45
who is he
Sleeping
Timeless
Pound for Pound
Finger Trembling
Overload
Kill Yourself (Part III)
2nd Hand
Antarctica
Memoirs Of A Gorilla
Runnin' Thru The 7th With My Woadies
Mount Sinai
FUCKTHEPOPULATION
Magazine
2 Hot 4 U (feat. $Uicdeboy$)
O Pana!
LTE
Champion Of Death
Seppuku (feat. Suicideboy$ & Jgrxxn)
You're Now Tuning Into 66.6 FM With DJ Rapture (The Hottest Hour Of The Evening)
Slip On A Banana Clip
A Death In The Ocean Would Be Beautiful
Shattered Amethyst
Goosebumps
Venom
Bury Me
Hack Slash
2000 Rounds
Sea Sick
Grain
"Beware
Kali Yuga
Hexada
Caligula
Niagara (feat. Lil Peep)
Scrying Through Shattered Glass
Polaris
Rapture
Blackmage
Tartarus
Until the Light Takes Us
As Above so Look out Below
Swan
Sneak Diss (feat. So6ix)
Plague Doctor Mask
Some of Us May Never See the World
Filth
Homecoming
Blood
Sweat
Tears
Anabolic
HDMI
Dirt
Oxygen
Branches
CtrlAltDelete
BlastZone (ЗонаПоражения)
CharacterSelect (ВыборПерсонажа)
RestInPeace (Prod. by The Virus And Antidote)
BlackMold
Toxin
Electric
Cranium
Friday
Hooky
Kalaxian Crystals
Slurp
BROKE ft. Prohibeo
Lies
Terry McGinnis
Gremlin
Giant Squit
You Are Not Like Us
Arachnids
Give Ah Fuck
Death Wish
Allergies
Cut Throat
Memoirs of a Gorilla
Benz Truck (гелик)
Norf Norf
Dat $tick
"RAF (feat. A$AP Rocky
Crazy
Still Cold / Pathway Private
The Chills
Slip on a Banana Clip
Lights
Akina Speed Star
Big Fish
The Bodies Fall Just Like the Leaves
Story: No Title
P.S Fuck You Cunt (feat. Lil Peep)
Torch
"Buff Squad (feat. Pouya
Sarcophagus III (feat. $Uicideboy$)
Virginia Tech
Lte
Fuckthepopulation
Gloss of Blood
100K
Dark Light
"But Wait
Great Influence
It Don't Matter
absolute in doubt
Boss
Look at Me Now
Bulletproof
Contraband
Deira City Centre
Kyoto
Pull Out Game
Bird Is The Word
Life Is Short
Here We Go Again
Bloodshed
Wassup Bro!
ACT 2 - BirthOfTheSpaceGod
Grey Tee
Sleeping Bag
Afterlife
King Cobra (Drippin')
Heart Attack
Chain$Aw
"King
P.T.S.D
Brand New
Jukai
Philosopher's Throne
PRBLMS
Back At It
misinformation/test/test_text.py

@@ -2,6 +2,8 @@ import os
 import pytest
 import spacy
 import misinformation.text as tt
+import misinformation
+import pandas as pd
 
 TESTDICT = {
     "IMG_3755": {
@@ -29,7 +31,6 @@ def test_TextDetector():
         assert test_obj.subdict["text_language"] is None
         assert test_obj.subdict["text_english"] is None
         assert not test_obj.analyse_text
-        assert not test_obj.analyse_topic
 
 
 @pytest.mark.gcv
@@ -39,7 +40,6 @@ def test_analyse_image():
         test_obj.analyse_image()
         test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
         test_obj.analyse_image()
-        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -68,6 +68,15 @@ def test_translate_text():
         assert test_obj.subdict["text_english"] == translated_text
 
 
+def test_remove_linebreaks():
+    test_obj = tt.TextDetector({})
+    test_obj.subdict["text"] = "This is \n a test."
+    test_obj.subdict["text_english"] = "This is \n another\n test."
+    test_obj.remove_linebreaks()
+    assert test_obj.subdict["text"] == "This is   a test."
+    assert test_obj.subdict["text_english"] == "This is   another  test."
+
+
 def test_run_spacy():
     test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
@@ -106,3 +115,34 @@ def test_sentiment_analysis():
     test_obj.sentiment_analysis()
     assert test_obj.subdict["polarity"] == 0.5
     assert test_obj.subdict["subjectivity"] == 0.6
+
+
+def test_PostprocessText():
+    reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
+    reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
+    obj = tt.PostprocessText(mydict=TESTDICT)
+    # make sure test works on windows where end-of-line character is \r\n
+    test_dict = obj.list_text_english[2].replace("\r", "")
+    assert test_dict == reference_dict
+    for key in TESTDICT.keys():
+        TESTDICT[key].pop("text_english")
+    with pytest.raises(ValueError):
+        tt.PostprocessText(mydict=TESTDICT)
+    obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv")
+    # make sure test works on windows where end-of-line character is \r\n
+    test_df = obj.list_text_english[0].replace("\r", "")
+    assert test_df == reference_df
+    with pytest.raises(ValueError):
+        tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv")
+    with pytest.raises(ValueError):
+        tt.PostprocessText()
+
+
+def test_analyse_topic():
+    _, topic_df, most_frequent_topics = tt.PostprocessText(
+        use_csv=True, csv_path="./test/data/topic_analysis_test.csv"
+    ).analyse_topic()
+    # since this is not deterministic we cannot be sure we get the same result twice
+    assert len(topic_df) == 2
+    assert topic_df["Name"].iloc[0] == "0_the_feat_of_is"
+    assert most_frequent_topics[0][0][0] == "the"
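
Note on the non-determinism remark in test_analyse_topic: BERTopic's output varies between runs mainly because of its UMAP dimensionality-reduction step. A minimal sketch of one way to make the topics reproducible, assuming the umap-learn package that BERTopic already depends on (the parameter values shown are UMAP's BERTopic defaults plus a fixed seed; this is not part of the commit):

    from bertopic import BERTopic
    from umap import UMAP

    # fixing the UMAP seed removes the main source of run-to-run variation
    umap_model = UMAP(
        n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
    )
    topic_model = BERTopic(umap_model=umap_model)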
misinformation/text.py

@@ -6,6 +6,9 @@ from textblob import TextBlob
 from textblob import download_corpora
 import io
 from misinformation import utils
+import grpc
+import pandas as pd
+from bertopic import BERTopic
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -13,14 +16,11 @@ from misinformation import utils
 
 
 class TextDetector(utils.AnalysisMethod):
-    def __init__(
-        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
-    ) -> None:
+    def __init__(self, subdict: dict, analyse_text: bool = False) -> None:
         super().__init__(subdict)
         self.subdict.update(self.set_keys())
         self.translator = Translator()
         self.analyse_text = analyse_text
-        self.analyse_topic = analyse_topic
         if self.analyse_text:
             self._initialize_spacy()
             self._initialize_textblob()
@@ -46,13 +46,12 @@ class TextDetector(utils.AnalysisMethod):
     def analyse_image(self):
         self.get_text_from_image()
         self.translate_text()
+        self.remove_linebreaks()
         if self.analyse_text:
             self._run_spacy()
             self.clean_text()
             self.correct_spelling()
             self.sentiment_analysis()
-        if self.analyse_topic:
-            self.analyse_topic()
         return self.subdict
 
     def get_text_from_image(self):
@@ -62,12 +61,20 @@ class TextDetector(utils.AnalysisMethod):
         with io.open(path, "rb") as image_file:
             content = image_file.read()
         image = vision.Image(content=content)
-        response = client.text_detection(image=image)
-        texts = response.text_annotations[0].description
-        # here check if text was found
-        if texts:
+        # check for usual connection errors and skip the image if the call fails
+        try:
+            response = client.text_detection(image=image)
+        except grpc.RpcError as exc:
+            print("Cloud vision API connection failed")
+            print("Skipping this image ..{}".format(path))
+            print("Connection failed with code {}: {}".format(exc.code(), exc))
+            return
+        # here check if text was found on image
+        if response.text_annotations:
+            texts = response.text_annotations[0].description
             self.subdict["text"] = texts
         if response.error.message:
             print("Google Cloud Vision Error")
             raise ValueError(
                 "{}\nFor more info on error messages, check: "
                 "https://cloud.google.com/apis/design/errors".format(
@@ -80,6 +87,14 @@ class TextDetector(utils.AnalysisMethod):
         self.subdict["text_language"] = translated.src
         self.subdict["text_english"] = translated.text
 
+    def remove_linebreaks(self):
+        """Remove linebreaks from original and translated text."""
+        if self.subdict["text"]:
+            self.subdict["text"] = self.subdict["text"].replace("\n", " ")
+            self.subdict["text_english"] = self.subdict["text_english"].replace(
+                "\n", " "
+            )
+
     def _run_spacy(self):
         """Generate spacy doc object."""
         self.doc = self.nlp(self.subdict["text_english"])
@@ -105,5 +120,73 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
-    def analyse_topic(self):
-        pass
+
+class PostprocessText:
+    def __init__(
+        self, mydict: dict = None, use_csv: bool = False, csv_path: str = None
+    ) -> None:
+        self.use_csv = use_csv
+        if mydict:
+            print("Reading data from dict.")
+            self.mydict = mydict
+            self.list_text_english = self.get_text_dict()
+        elif self.use_csv:
+            print("Reading data from df.")
+            self.df = pd.read_csv(csv_path, encoding="utf8")
+            self.list_text_english = self.get_text_df()
+        else:
+            raise ValueError(
+                "Please provide either a dictionary with textual data or "
+                "a csv file by setting `use_csv` to True and providing a "
+                "`csv_path`."
+            )
+
+    def analyse_topic(self, return_topics: int = 3):
+        """Topic analysis using BERTopic."""
+        # load spacy pipeline
+        nlp = spacy.load(
+            "en_core_web_md",
+            exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
+        )
+        try:
+            # unfortunately catching exceptions does not work here - need to figure out why
+            self.topic_model = BERTopic(embedding_model=nlp)
+        except TypeError:
+            print("BERTopic exited with an error - maybe your dataset is too small?")
+        self.topics, self.probs = self.topic_model.fit_transform(self.list_text_english)
+        # return the topic list
+        topic_df = self.topic_model.get_topic_info()
+        # return the most frequent return_topics
+        most_frequent_topics = []
+        if len(topic_df) < return_topics:
+            print("You requested more topics than are identified in your dataset -")
+            print(
+                "Returning only {} topics as these are all that have been found.".format(
+                    len(topic_df)
+                )
+            )
+        for i in range(min(return_topics, len(topic_df))):
+            most_frequent_topics.append(self.topic_model.get_topic(i))
+        return self.topic_model, topic_df, most_frequent_topics
+
+    def get_text_dict(self):
+        # use dict to put text_english in list
+        list_text_english = []
+        for key in self.mydict.keys():
+            if "text_english" not in self.mydict[key]:
+                raise ValueError(
+                    "Please check your provided dictionary - "
+                    "no english text data found."
+                )
+            list_text_english.append(self.mydict[key]["text_english"])
+        return list_text_english
+
+    def get_text_df(self):
+        # use csv file to obtain dataframe and put text_english in list
+        # check that "text_english" is there
+        if "text_english" not in self.df:
+            raise ValueError(
+                "Please check your provided dataframe - "
+                "no english text data found."
+            )
+        return self.df["text_english"].tolist()
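
For orientation, a minimal usage sketch of the new PostprocessText class (the csv path is illustrative; any csv with a text_english column, such as the data_out.csv written by the notebook below, should work):

    import misinformation.text as tt

    # read previously detected and translated text from a csv
    pp = tt.PostprocessText(use_csv=True, csv_path="data_out.csv")
    # fit BERTopic and return the model, the topic overview table and
    # the most frequent topics
    topic_model, topic_df, most_frequent_topics = pp.analyse_topic(return_topics=3)
    print(topic_df)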
notebooks/get-text-from-image.ipynb (generated, 157 lines changed)
@@ -42,7 +42,18 @@
     "import os\n",
     "from IPython.display import Image, display\n",
     "import misinformation\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "print(tf.config.list_physical_devices(\"GPU\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27675810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# download the models if they are not there yet\n",
     "!python -m spacy download en_core_web_md\n",
     "!python -m textblob.download_corpora"
@@ -55,9 +66,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(\n",
-    "    path=\"drive/MyDrive/misinformation-data/\", limit=1000\n",
-    ")"
+    "images = misinformation.find_files(path=\"../data/all/\", limit=1000)"
    ]
   },
   {
@@ -78,7 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mydict = misinformation.utils.initialize_dict(images[0:10])"
+    "mydict = misinformation.utils.initialize_dict(images[0:3])"
    ]
   },
   {
@@ -99,7 +108,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -180,13 +189,143 @@
    "outputs": [],
    "source": [
     "# Write the csv\n",
-    "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")"
+    "df.to_csv(\"./data_out.csv\")"
    ]
   },
   {
+   "cell_type": "markdown",
+   "id": "4bc8ac0a",
+   "metadata": {},
+   "source": [
+    "# Topic analysis\n",
+    "The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html) with an embedding model provided through a [spaCy](https://spacy.io/) pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4931941b",
+   "metadata": {},
+   "source": [
+    "BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the below returns an error for `analyse_topic()`, the reason may be that your dataset is too small.\n",
+    "### Option 1: Use the dictionary as obtained from the above analysis."
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
-   "id": "568537df",
+   "id": "a3450a61",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# make a list of all the text_english entries per analysed image from the mydict variable as above\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    mydict=mydict\n",
+    ").analyse_topic()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95667342",
+   "metadata": {},
+   "source": [
+    "### Option 2: Read in a csv\n",
+    "To avoid analysing too many images on Google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5530e436",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file_path = \"data_out.csv\"\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    use_csv=True, csv_path=input_file_path\n",
+    ").analyse_topic(return_topics=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b6ef6d7",
+   "metadata": {},
+   "source": [
+    "### Access frequent topics\n",
+    "A topic of `-1` stands for an outlier and should be ignored. The topic count is the number of occurrences of that topic. The output is ordered from most frequent to least frequent topic."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(topic_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3316770",
+   "metadata": {},
+   "source": [
+    "### Get information for specific topic\n",
+    "The most frequent topics can be accessed through `most_frequent_topics`, with the most frequent topics first in the list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db14fe03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for topic in most_frequent_topics:\n",
+    "    print(\"Topic:\", topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d10f701e",
+   "metadata": {},
+   "source": [
+    "### Topic visualization\n",
+    "The topics can also be visualized. Careful: this only works if there is sufficient data (quantity and quality)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2331afe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.visualize_topics()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4eaf353",
+   "metadata": {},
+   "source": [
+    "### Save the model\n",
+    "The model can be saved for future use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e8377c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.save(\"misinfo_posts\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c94edb9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
@@ -194,7 +333,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "misinf",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -208,7 +347,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
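
The notebook ends by persisting the fitted model via topic_model.save("misinfo_posts"). A short sketch of restoring it in a later session (standard BERTopic API; the path is the one used in the notebook):

    from bertopic import BERTopic

    # restore the saved model for further inspection
    topic_model = BERTopic.load("misinfo_posts")
    print(topic_model.get_topic_info())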
pyproject.toml

@@ -48,6 +48,9 @@ dependencies = [
     "textblob",
     "torch",
-    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main"
+    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main",
+    "bertopic",
+    "grpcio",
 ]
 
 [project.scripts]
requirements.txt

@@ -24,3 +24,5 @@ spacytextblob
 textblob
 git+https://github.com/sloria/TextBlob.git@dev
 git+https://github.com/salesforce/LAVIS.git@main
+bertopic
+grpcio