Inga Ulusoy b585097f19
remove obsolete features (#48)
* reduce code smells

* account for transl. version change

* remove print statements

* remove imageai
2022-12-31 22:15:40 +01:00

68 строки
2.3 KiB
Python

from google.cloud import vision
from googletrans import Translator
import spacy
import io
from misinformation import utils
class TextDetector(utils.AnalysisMethod):
    """Extract the text shown on an image, translate it and clean it.

    The results are written into ``self.subdict`` under the keys
    "text", "text_language", "text_english" and "text_clean".
    """

    def __init__(self, subdict: dict) -> None:
        """Prepare the detector for a single image.

        Args:
            subdict: Result dictionary for the image. It must contain a
                "filename" entry, which ``get_text_from_image`` reads as
                the path of the image file.
        """
        super().__init__(subdict)
        self.subdict.update(self.set_keys())
        self.translator = Translator()
        # TODO: loading the spacy model should be a separate method that
        # raises a clear error (or downloads the model dynamically) if the
        # model is not found.
        self.nlp = spacy.load("en_core_web_md")

    def set_keys(self) -> dict:
        """Return the result keys of this detector, all initialised to None."""
        # "text_clean" is the key that clean_text() actually writes; the two
        # must agree, otherwise the declared key would stay None forever.
        return {
            "text": None,
            "text_language": None,
            "text_english": None,
            "text_clean": None,
        }

    def analyse_image(self) -> dict:
        """Run text detection, translation and cleaning for the image.

        Returns:
            ``self.subdict`` with the text entries filled in. If no text was
            detected on the image, translation and cleaning are skipped and
            the corresponding entries keep their current values.
        """
        self.get_text_from_image()
        if not self.subdict["text"]:
            # Nothing to translate or clean.
            return self.subdict
        self.translate_text()
        self._init_spacy()
        self.clean_text()
        return self.subdict

    def get_text_from_image(self) -> None:
        """Detect text on the image and store it under ``subdict["text"]``.

        The image is read from ``self.subdict["filename"]``. When the
        response contains no text annotations, "text" is set to None.

        Raises:
            ValueError: If the Vision API response carries an error message.
        """
        path = self.subdict["filename"]
        client = vision.ImageAnnotatorClient()
        with io.open(path, "rb") as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.text_detection(image=image)
        # Check for an API error first: an errored response has no
        # annotations, so indexing them first would hide the real error
        # behind an IndexError.
        if response.error.message:
            raise ValueError(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(
                    response.error.message
                )
            )
        annotations = response.text_annotations
        # The first annotation's description is used as the detected text.
        texts = annotations[0].description if annotations else None
        # Update the entry in place so "filename" and the other keys of the
        # result dictionary are preserved.
        self.subdict["text"] = texts

    def translate_text(self) -> None:
        """Translate the detected text and record its source language.

        Stores the ``src`` attribute of the translation result under
        "text_language" and its ``text`` attribute under "text_english".
        """
        translated = self.translator.translate(self.subdict["text"])
        self.subdict["text_language"] = translated.src
        self.subdict["text_english"] = translated.text

    def _init_spacy(self) -> None:
        """Generate the spacy doc object from the translated text."""
        self.doc = self.nlp(self.subdict["text_english"])

    def clean_text(self) -> None:
        """Clean the text from unrecognized words and any numbers.

        Keeps only tokens that are not tagged as "NUM" and that have a
        word vector, joins them with single spaces and stores the result
        under "text_clean". Requires ``_init_spacy`` to have been called.
        """
        kept_tokens = [
            token.text
            for token in self.doc
            if token.pos_ != "NUM" and token.has_vector
        ]
        self.subdict["text_clean"] = " ".join(kept_tokens).strip()