Mirror of https://github.com/ssciwr/AMMICO.git
Synced 2025-10-31 14:06:04 +02:00

	Merge branch 'main' into add_image_summary
Commit f787164572 by Inga Ulusoy

misinformation/test/data/test_data_out.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
+,filename,text,text_language,text_english
+0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
+für Ingenieure und Naturwissenschaftler
+Mit zahlreichen Abbildungen und Rechenbeispielen
+und einer ausführlichen Integraltafel
+3., verbesserte Auflage"
+1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+ballenger stor goin
+gdĐOL, SIVI 23 TL 02
+de in obl
+och yd badalang
+a
+Ber
+ook Sy-RW enot go baldus",om,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+balloons big goin
+gdĐOL, SIVI 23 TL
+there in obl
+och yd change
+a
+Ber
+ook Sy-RW isn't going anywhere"
+2,./test/data/IMG_3757.jpg,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON",en,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON"

misinformation/test/data/test_data_out_nokey.csv (new file, 52 lines)
@@ -0,0 +1,52 @@
+,filename,text,text_language,text_nglish
+0,./test/data/IMG_3755.jpg,,,"Mathematische Formelsammlung
+für Ingenieure und Naturwissenschaftler
+Mit zahlreichen Abbildungen und Rechenbeispielen
+und einer ausführlichen Integraltafel
+3., verbesserte Auflage"
+1,./test/data/IMG_3756.jpg,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+ballenger stor goin
+gdĐOL, SIVI 23 TL 02
+de in obl
+och yd badalang
+a
+Ber
+ook Sy-RW enot go baldus",om,"SCATTERING THEORY
+The Quantum Theory of
+Nonrelativistic Collisions
+JOHN R. TAYLOR
+University of Colorado
+ostaliga Lanbidean
+1 ilde
+balloons big goin
+gdĐOL, SIVI 23 TL
+there in obl
+och yd change
+a
+Ber
+ook Sy-RW isn't going anywhere"
+2,./test/data/IMG_3757.jpg,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON",en,"THE
+ALGEBRAIC
+EIGENVALUE
+PROBLEM
+DOM
+NVS TIO
+MINA
+Monographs
+on Numerical Analysis
+J.. H. WILKINSON"

misinformation/test/data/topic_analysis_test.csv (new file, 190 lines)
@@ -0,0 +1,190 @@
+text_english
+Mercury: Retrograde
+Pathology
+Symbiote
+ProductOfDrugs (Prod. The Virus and Antidote)
+Venom
+Gatteka
+kamikaze (+ pulse)
+T.R.U. (Totally Rotten Underground)
+I Put My Dick in Your Mental
+Andromeda
+BRAINFOOD
+Troll Under the Bridge
+1000 Rounds
+Sacrifice
+Backpack
+D(R)Own
+"Okay
+TakingOutTheTrash
+Io sono qui
+Paris
+Murder
+High 'N Mighty
+Euronymous
+Hades
+Nails
+Squeeze
+No Teeth
+Bang Ya Fucking Head
+BLUE JUICE
+Loch Ness
+Hold Uh
+Bone Saw
+Coffin Wave
+OhNo!
+TheArtOfCremation
+OakGroveRoad
+WhatWasThat
+FunnyToSeeYouHere
+John Dee
+Kybalion
+Killer
+608
+Eternal Dreams
+Nightmare Choir (I Been Asleep Too Long)
+Exodus
+Vengeance
+Claustrophobia
+Rearranged
+Paralax
+Exsanguination
+Mutiny
+Centipede
+Грустная сука
+This World Is Sick
+Пламя
+2:45
+who is he
+Sleeping
+Timeless
+Pound for Pound
+Finger Trembling
+Overload
+Kill Yourself (Part III)
+2nd Hand
+Antarctica
+Memoirs Of A Gorilla
+Runnin' Thru The 7th With My Woadies
+Mount Sinai
+FUCKTHEPOPULATION
+Magazine
+2 Hot 4 U (feat. $Uicdeboy$)
+O Pana!
+LTE
+Champion Of Death
+Seppuku (feat. Suicideboy$ & Jgrxxn)
+You're Now Tuning Into 66.6 FM With DJ Rapture (The Hottest Hour Of The Evening)
+Slip On A Banana Clip
+A Death In The Ocean Would Be Beautiful
+Shattered Amethyst
+Goosebumps
+Venom
+Bury Me
+Hack Slash
+2000 Rounds
+Sea Sick
+Grain
+"Beware
+Kali Yuga
+Hexada
+Caligula
+Niagara (feat. Lil Peep)
+Scrying Through Shattered Glass
+Polaris
+Rapture
+Blackmage
+Tartarus
+Until the Light Takes Us
+As Above so Look out Below
+Swan
+Sneak Diss (feat. So6ix)
+Plague Doctor Mask
+Some of Us May Never See the World
+Filth
+Homecoming
+Blood
+Sweat
+Tears
+Anabolic
+HDMI
+Dirt
+Oxygen
+Branches
+CtrlAltDelete
+BlastZone (ЗонаПоражения)
+CharacterSelect (ВыборПерсонажа)
+RestInPeace (Prod. by The Virus And Antidote)
+BlackMold
+Toxin
+Electric
+Cranium
+Friday
+Hooky
+Kalaxian Crystals
+Slurp
+BROKE ft. Prohibeo
+Lies
+Terry McGinnis
+Gremlin
+Giant Squit
+You Are Not Like Us
+Arachnids
+Give Ah Fuck
+Death Wish
+Allergies
+Cut Throat
+Memoirs of a Gorilla
+Benz Truck (гелик)
+Norf Norf
+Dat $tick
+"RAF (feat. A$AP Rocky
+Crazy
+Still Cold / Pathway Private
+The Chills
+Slip on a Banana Clip
+Lights
+Akina Speed Star
+Big Fish
+The Bodies Fall Just Like the Leaves
+Story: No Title
+P.S Fuck You Cunt (feat. Lil Peep)
+Torch
+"Buff Squad (feat. Pouya
+Sarcophagus III (feat. $Uicideboy$)
+Virginia Tech
+Lte
+Fuckthepopulation
+Gloss of Blood
+100K
+Dark Light
+"But Wait
+Great Influence
+It Don't Matter
+absolute in doubt
+Boss
+Look at Me Now
+Bulletproof
+Contraband
+Deira City Centre
+Kyoto
+Pull Out Game
+Bird Is The Word
+Life Is Short
+Here We Go Again
+Bloodshed
+Wassup Bro!
+ACT 2 - BirthOfTheSpaceGod
+Grey Tee
+Sleeping Bag
+Afterlife
+King Cobra (Drippin')
+Heart Attack
+Chain$Aw
+"King
+P.T.S.D
+Brand New
+Jukai
+Philosopher's Throne
+PRBLMS
+Back At It
This file cannot be displayed because it contains an unexpected character at line 88, column 1.

misinformation/test/test_text.py
@@ -2,6 +2,8 @@ import os
 import pytest
 import spacy
 import misinformation.text as tt
+import misinformation
+import pandas as pd
 
 TESTDICT = {
     "IMG_3755": {
@@ -29,7 +31,6 @@ def test_TextDetector():
         assert test_obj.subdict["text_language"] is None
         assert test_obj.subdict["text_english"] is None
         assert not test_obj.analyse_text
-        assert not test_obj.analyse_topic
 
 
 @pytest.mark.gcv
@@ -39,7 +40,6 @@ def test_analyse_image():
         test_obj.analyse_image()
         test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True)
         test_obj.analyse_image()
-        test_obj = tt.TextDetector(TESTDICT[item], analyse_topic=True)
 
 
 @pytest.mark.gcv
@@ -68,6 +68,15 @@ def test_translate_text():
         assert test_obj.subdict["text_english"] == translated_text
 
 
+def test_remove_linebreaks():
+    test_obj = tt.TextDetector({})
+    test_obj.subdict["text"] = "This is \n a test."
+    test_obj.subdict["text_english"] = "This is \n another\n test."
+    test_obj.remove_linebreaks()
+    assert test_obj.subdict["text"] == "This is   a test."
+    assert test_obj.subdict["text_english"] == "This is   another  test."
+
+
 def test_run_spacy():
     test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
     ref_file = "./test/data/text_IMG_3755.txt"
@@ -106,3 +115,34 @@ def test_sentiment_analysis():
     test_obj.sentiment_analysis()
     assert test_obj.subdict["polarity"] == 0.5
     assert test_obj.subdict["subjectivity"] == 0.6
+
+
+def test_PostprocessText():
+    reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
+    reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
+    obj = tt.PostprocessText(mydict=TESTDICT)
+    # make sure test works on windows where end-of-line character is \r\n
+    test_dict = obj.list_text_english[2].replace("\r", "")
+    assert test_dict == reference_dict
+    for key in TESTDICT.keys():
+        TESTDICT[key].pop("text_english")
+    with pytest.raises(ValueError):
+        tt.PostprocessText(mydict=TESTDICT)
+    obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv")
+    # make sure test works on windows where end-of-line character is \r\n
+    test_df = obj.list_text_english[0].replace("\r", "")
+    assert test_df == reference_df
+    with pytest.raises(ValueError):
+        tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv")
+    with pytest.raises(ValueError):
+        tt.PostprocessText()
+
+
+def test_analyse_topic():
+    _, topic_df, most_frequent_topics = tt.PostprocessText(
+        use_csv=True, csv_path="./test/data/topic_analysis_test.csv"
+    ).analyse_topic()
+    # since this is not deterministic we cannot be sure we get the same result twice
+    assert len(topic_df) == 2
+    assert topic_df["Name"].iloc[0] == "0_the_feat_of_is"
+    assert most_frequent_topics[0][0][0] == "the"

misinformation/text.py
@@ -6,6 +6,9 @@ from textblob import TextBlob
 from textblob import download_corpora
 import io
 from misinformation import utils
+import grpc
+import pandas as pd
+from bertopic import BERTopic
 
 # make widgets work again
 # clean text has weird spaces and separation of "do n't"
@@ -13,14 +16,11 @@ from misinformation import utils
 
 
 class TextDetector(utils.AnalysisMethod):
-    def __init__(
-        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
-    ) -> None:
+    def __init__(self, subdict: dict, analyse_text: bool = False) -> None:
         super().__init__(subdict)
         self.subdict.update(self.set_keys())
         self.translator = Translator()
         self.analyse_text = analyse_text
-        self.analyse_topic = analyse_topic
         if self.analyse_text:
             self._initialize_spacy()
             self._initialize_textblob()
@@ -46,13 +46,12 @@ class TextDetector(utils.AnalysisMethod):
     def analyse_image(self):
         self.get_text_from_image()
         self.translate_text()
+        self.remove_linebreaks()
         if self.analyse_text:
             self._run_spacy()
             self.clean_text()
             self.correct_spelling()
             self.sentiment_analysis()
-        if self.analyse_topic:
-            self.analyse_topic()
         return self.subdict
 
     def get_text_from_image(self):
@@ -62,12 +61,19 @@ class TextDetector(utils.AnalysisMethod):
         with io.open(path, "rb") as image_file:
             content = image_file.read()
         image = vision.Image(content=content)
-        response = client.text_detection(image=image)
-        texts = response.text_annotations[0].description
-        # here check if text was found
-        if texts:
+        # check for usual connection errors and retry if necessary
+        try:
+            response = client.text_detection(image=image)
+        except grpc.RpcError as exc:
+            print("Cloud vision API connection failed")
+            print("Skipping this image ..{}".format(path))
+            print("Connection failed with code {}: {}".format(exc.code(), exc))
+        # here check if text was found on image
+        if response:
+            texts = response.text_annotations[0].description
             self.subdict["text"] = texts
         if response.error.message:
+            print("Google Cloud Vision Error")
             raise ValueError(
                 "{}\nFor more info on error messages, check: "
                 "https://cloud.google.com/apis/design/errors".format(
@@ -80,6 +86,14 @@ class TextDetector(utils.AnalysisMethod):
         self.subdict["text_language"] = translated.src
         self.subdict["text_english"] = translated.text
 
+    def remove_linebreaks(self):
+        """Remove linebreaks from original and translated text."""
+        if self.subdict["text"]:
+            self.subdict["text"] = self.subdict["text"].replace("\n", " ")
+            self.subdict["text_english"] = self.subdict["text_english"].replace(
+                "\n", " "
+            )
+
     def _run_spacy(self):
         """Generate spacy doc object."""
         self.doc = self.nlp(self.subdict["text_english"])
@@ -105,5 +119,73 @@ class TextDetector(utils.AnalysisMethod):
         # where 0.0 is very objective and 1.0 is very subjective
         self.subdict["subjectivity"] = self.doc._.blob.subjectivity
 
-    def analyse_topic(self):
-        pass
+
+class PostprocessText:
+    def __init__(
+        self, mydict: dict = None, use_csv: bool = False, csv_path: str = None
+    ) -> None:
+        self.use_csv = use_csv
+        if mydict:
+            print("Reading data from dict.")
+            self.mydict = mydict
+            self.list_text_english = self.get_text_dict()
+        elif self.use_csv:
+            print("Reading data from df.")
+            self.df = pd.read_csv(csv_path, encoding="utf8")
+            self.list_text_english = self.get_text_df()
+        else:
+            raise ValueError(
+                "Please provide either dictionary with textual data or \
+                              a csv file by setting `use_csv` to True and providing a \
+                             `csv_path`."
+            )
+
+    def analyse_topic(self, return_topics: int = 3):
+        """Topic analysis using BERTopic."""
+        # load spacy pipeline
+        nlp = spacy.load(
+            "en_core_web_md",
+            exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
+        )
+        try:
+            # unfortunately catching exceptions does not work here - need to figure out why
+            self.topic_model = BERTopic(embedding_model=nlp)
+        except TypeError:
+            print("BERTopic exited with an error - maybe your dataset is too small?")
+        self.topics, self.probs = self.topic_model.fit_transform(self.list_text_english)
+        # return the topic list
+        topic_df = self.topic_model.get_topic_info()
+        # return the most frequent return_topics
+        most_frequent_topics = []
+        if len(topic_df) < return_topics:
+            print("You requested more topics than are identified in your dataset -")
+            print(
+                "Returning only {} topics as these are all that have been found.".format(
+                    len(topic_df)
+                )
+            )
+        for i in range(min(return_topics, len(topic_df))):
+            most_frequent_topics.append(self.topic_model.get_topic(i))
+        return self.topic_model, topic_df, most_frequent_topics
+
+    def get_text_dict(self):
+        # use dict to put text_english in list
+        list_text_english = []
+        for key in self.mydict.keys():
+            if "text_english" not in self.mydict[key]:
+                raise ValueError(
+                    "Please check your provided dictionary - \
+                no english text data found."
+                )
+            list_text_english.append(self.mydict[key]["text_english"])
+        return list_text_english
+
+    def get_text_df(self):
+        # use csv file to obtain dataframe and put text_english in list
+        # check that "text_english" is there
+        if "text_english" not in self.df:
+            raise ValueError(
+                "Please check your provided dataframe - \
+                                no english text data found."
+            )
+        return self.df["text_english"].tolist()
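
Taken together, the diff above replaces the old `analyse_topic` stub on `TextDetector` with a standalone `PostprocessText` class that feeds collected `text_english` entries into BERTopic. A minimal usage sketch of the two entry points, mirroring the notebook cells below; the dictionary contents here are illustrative only:

import misinformation.text as tt

# Option 1: postprocess an in-memory dict as produced by TextDetector.analyse_image().
# Every entry must carry a "text_english" key, otherwise a ValueError is raised.
mydict = {
    "img1": {"text_english": "Scattering theory of nonrelativistic collisions."},
    "img2": {"text_english": "A formula collection for engineers and scientists."},
}
topic_model, topic_df, most_frequent_topics = tt.PostprocessText(
    mydict=mydict
).analyse_topic()

# Option 2: read a previous csv export; the file needs a "text_english" column.
topic_model, topic_df, most_frequent_topics = tt.PostprocessText(
    use_csv=True, csv_path="./data_out.csv"
).analyse_topic(return_topics=10)

Note that BERTopic needs a reasonably large corpus to fit; a two-entry toy dict like the one above will typically fail inside `fit_transform`, which is why the new `test_analyse_topic` test runs on a 189-row csv.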
							
								
								
									
notebooks/get-text-from-image.ipynb (generated, 157 lines)
@@ -42,7 +42,18 @@
     "import os\n",
     "from IPython.display import Image, display\n",
     "import misinformation\n",
+    "import tensorflow as tf\n",
     "\n",
+    "print(tf.config.list_physical_devices(\"GPU\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27675810",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# download the models if they are not there yet\n",
     "!python -m spacy download en_core_web_md\n",
     "!python -m textblob.download_corpora"
@@ -55,9 +66,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "images = misinformation.find_files(\n",
-    "    path=\"drive/MyDrive/misinformation-data/\", limit=1000\n",
-    ")"
+    "images = misinformation.find_files(path=\"../data/all/\", limit=1000)"
    ]
   },
   {
@@ -78,7 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mydict = misinformation.utils.initialize_dict(images[0:10])"
+    "mydict = misinformation.utils.initialize_dict(images[0:3])"
    ]
   },
   {
@@ -99,7 +108,7 @@
    "source": [
     "os.environ[\n",
     "    \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
-    "] = \"drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
+    "] = \"../data/misinformation-campaign-981aa55a3b13.json\""
    ]
   },
   {
@@ -180,13 +189,143 @@
    "outputs": [],
    "source": [
     "# Write the csv\n",
-    "df.to_csv(\"drive/MyDrive/misinformation-data/data_out.csv\")"
+    "df.to_csv(\"./data_out.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bc8ac0a",
+   "metadata": {},
+   "source": [
+    "# Topic analysis\n",
+    "The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html) with an embedded model through a [spaCy](https://spacy.io/) pipeline."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4931941b",
+   "metadata": {},
+   "source": [
+    "BERTopic takes a list of strings as input. The more items in the list, the better the topic modeling works. If the cell below returns an error for `analyse_topic()`, the reason may be that your dataset is too small.\n",
+    "### Option 1: Use the dictionary as obtained from the above analysis."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "568537df",
+   "id": "a3450a61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make a list of all the text_english entries per analysed image from the mydict variable as above\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    mydict=mydict\n",
+    ").analyse_topic()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95667342",
+   "metadata": {},
+   "source": [
+    "### Option 2: Read in a csv\n",
+    "To avoid analysing too many images on Google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5530e436",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file_path = \"data_out.csv\"\n",
+    "topic_model, topic_df, most_frequent_topics = misinformation.text.PostprocessText(\n",
+    "    use_csv=True, csv_path=input_file_path\n",
+    ").analyse_topic(return_topics=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b6ef6d7",
+   "metadata": {},
+   "source": [
+    "### Access frequent topics\n",
+    "A topic of `-1` stands for an outlier and should be ignored. The topic count is the number of occurrences of that topic. The output is ordered from most frequent to least frequent topic."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(topic_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3316770",
+   "metadata": {},
+   "source": [
+    "### Get information for a specific topic\n",
+    "The most frequent topics can be accessed through `most_frequent_topics`, with the most frequently occurring topics first in the list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db14fe03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for topic in most_frequent_topics:\n",
+    "    print(\"Topic:\", topic)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d10f701e",
+   "metadata": {},
+   "source": [
+    "### Topic visualization\n",
+    "The topics can also be visualized. Careful: this only works if there is sufficient data (quantity and quality)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2331afe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.visualize_topics()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4eaf353",
+   "metadata": {},
+   "source": [
+    "### Save the model\n",
+    "The model can be saved for future use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5e8377c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "topic_model.save(\"misinfo_posts\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c94edb9",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -194,7 +333,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "misinf",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -208,7 +347,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {

pyproject.toml
@@ -48,6 +48,8 @@ dependencies = [
     "textblob",
     "torch",
-    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main"
+    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main",
+    "bertopic",
+    "grpcio",
 ]
 
 [project.scripts]

requirements.txt
@@ -24,3 +24,5 @@ spacytextblob
 textblob
 git+https://github.com/sloria/TextBlob.git@dev
 git+https://github.com/salesforce/LAVIS.git@main
+bertopic
+grpcio
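
Both dependency files gain the same two packages, matching the imports added to misinformation/text.py: grpcio provides the `grpc` module used to catch Cloud Vision connection errors, and bertopic backs `PostprocessText.analyse_topic`. A quick, hypothetical sanity check that an updated environment resolves them:

# Assumes `pip install bertopic grpcio` (or an install from the updated
# requirements.txt / pyproject.toml) has already been run.
import grpc
from bertopic import BERTopic

print("grpcio", grpc.__version__)  # needed for grpc.RpcError handling
print("bertopic imports OK")       # needed by PostprocessText.analyse_topic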