зеркало из
				https://github.com/ssciwr/AMMICO.git
				synced 2025-10-31 05:56:05 +02:00 
			
		
		
		
	 54728e02bb
			
		
	
	
		54728e02bb
		
			
		
	
	
	
	
		
			
			* update notebook * comments * add jupyterlab * add text analysis capability * add bool in tests * add dependencies and spelling test * add test sentiment * update black pre-commit dependency for native nb support * update black version, find better sentiment test * test analyse_image
		
			
				
	
	
		
			100 строки
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			100 строки
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from google.cloud import vision
 | |
| from googletrans import Translator
 | |
| import spacy
 | |
| from spacytextblob.spacytextblob import SpacyTextBlob
 | |
| from textblob import TextBlob
 | |
| import io
 | |
| from misinformation import utils
 | |
| 
 | |
| # make widgets work again
 | |
| # clean text has weird spaces and separation of "do n't"
 | |
| # increase coverage for text
 | |
| 
 | |
| 
 | |
class TextDetector(utils.AnalysisMethod):
    """Detect text on an image, translate it and optionally analyse it.

    The results are collected in ``self.subdict`` (expected to be provided by
    ``utils.AnalysisMethod.__init__`` -- TODO confirm against the base class),
    which must contain a ``"filename"`` entry pointing to the image on disk.

    Args:
        subdict: Result dictionary for one image; is extended in place.
        analyse_text: If True, run cleaning, spell correction and sentiment
            analysis on the translated text (loads the spaCy model).
        analyse_topic: If True, run the (not yet implemented) topic analysis.
    """

    def __init__(
        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
    ) -> None:
        super().__init__(subdict)
        self.subdict.update(self.set_keys())
        self.translator = Translator()
        self.analyse_text = analyse_text
        # NOTE: this instance attribute has the same name as the
        # ``analyse_topic`` method and therefore shadows it on instances.
        # The name is kept for backward compatibility; the method has to be
        # looked up on the class (see ``analyse_image``).
        self.analyse_topic = analyse_topic
        if self.analyse_text:
            # TODO: spacy load should be a separate method with an error if
            # the model is not found / dynamic download
            self.nlp = spacy.load("en_core_web_md")
            self.nlp.add_pipe("spacytextblob")

    def set_keys(self) -> dict:
        """Return the result keys of this detector, all initialised to None."""
        # "text_clean" matches the key that ``clean_text`` actually writes.
        return {
            "text": None,
            "text_language": None,
            "text_english": None,
            "text_clean": None,
        }

    def analyse_image(self):
        """Run the full pipeline and return the updated result dictionary.

        Text is first extracted from the image. Translation and the optional
        text analysis only run when some text was found; otherwise the
        corresponding entries keep their current values.

        Returns:
            The result dictionary ``self.subdict``.
        """
        self.get_text_from_image()
        if self.subdict["text"]:
            self.translate_text()
            if self.analyse_text:
                self._init_spacy()
                self.clean_text()
                self.correct_spelling()
                self.sentiment_analysis()
        if self.analyse_topic:
            # ``self.analyse_topic`` is the bool flag set in ``__init__``;
            # calling it would raise TypeError, so fetch the method from the
            # class instead.
            type(self).analyse_topic(self)
        return self.subdict

    def get_text_from_image(self):
        """Detect text on the image and store it under ``subdict["text"]``.

        ``subdict["text"]`` is set to None when the response contains no
        text annotations.

        Raises:
            ValueError: If the Vision API response carries an error message.
        """
        path = self.subdict["filename"]
        client = vision.ImageAnnotatorClient()
        with open(path, "rb") as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.text_detection(image=image)
        # Check for an API error before touching the annotations.
        if response.error.message:
            raise ValueError(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(
                    response.error.message
                )
            )
        annotations = response.text_annotations
        # Update the entry in place so "filename" and the other keys survive.
        self.subdict["text"] = annotations[0].description if annotations else None

    def translate_text(self):
        """Translate the detected text and record its source language.

        Uses the translator's default target language (presumably English,
        as the result is stored under "text_english" -- TODO confirm).
        """
        translated = self.translator.translate(self.subdict["text"])
        self.subdict["text_language"] = translated.src
        self.subdict["text_english"] = translated.text

    def _init_spacy(self):
        """Generate the spaCy doc object from the translated text."""
        self.doc = self.nlp(self.subdict["text_english"])

    def clean_text(self):
        """Clean the text from unrecognized words and any numbers.

        Keeps only tokens that have a word vector and are not tagged as
        ``NUM``; the remaining token texts are joined with single spaces.
        """
        kept = [
            token.text
            for token in self.doc
            if token.pos_ != "NUM" and token.has_vector
        ]
        self.subdict["text_clean"] = " ".join(kept).strip()

    def correct_spelling(self):
        """Store a spell-corrected version of the translated text."""
        self.textblob = TextBlob(self.subdict["text_english"])
        self.subdict["text_english_correct"] = str(self.textblob.correct())

    def sentiment_analysis(self):
        """Store polarity and subjectivity of the translated text.

        Per-word assessments (``blob.sentiment_assessments.assessments``)
        are currently not stored.
        """
        blob = self.doc._.blob
        # polarity is between [-1.0, 1.0]
        self.subdict["polarity"] = blob.polarity
        # subjectivity is a float within the range [0.0, 1.0]
        # where 0.0 is very objective and 1.0 is very subjective
        self.subdict["subjectivity"] = blob.subjectivity

    def analyse_topic(self):
        """Placeholder for topic analysis; not implemented yet."""
        pass
 |