# AMMICO/misinformation/text.py

from google.cloud import vision
from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob
import io
from misinformation import utils

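# TODO: make the widgets work again
# TODO: the cleaned text has odd spacing because tokens are re-joined with
#       single spaces, so contractions come out split (e.g. "do n't")
# TODO: increase test coverage for the text module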


class TextDetector(utils.AnalysisMethod):
    """Extract the text shown on an image and optionally analyse it.

    The image is read from ``subdict["filename"]``, sent to the Google Cloud
    Vision text detection endpoint and the detected text is translated with
    googletrans. With ``analyse_text=True`` the translated text is additionally
    cleaned, spell-corrected and scored for sentiment. All results are written
    into ``self.subdict``.
    """

    def __init__(
        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
    ) -> None:
        """Set up the detector.

        Args:
            subdict: Result dictionary for one image; ``get_text_from_image``
                reads the image path from its ``"filename"`` entry.
            analyse_text: If True, load the spaCy pipeline and run cleaning,
                spelling correction and sentiment analysis.
            analyse_topic: If True, run ``analyse_topic`` (currently a
                placeholder) after the other steps.
        """
        super().__init__(subdict)
        self.subdict.update(self.set_keys())
        self.translator = Translator()
        self.analyse_text = analyse_text
        # Stored under a private name: assigning the flag to
        # ``self.analyse_topic`` would shadow the ``analyse_topic`` method on
        # the instance and make ``self.analyse_topic()`` call a bool.
        self._analyse_topic = analyse_topic
        if self.analyse_text:
            # TODO: spacy load should be a separate method with an error if the
            # model is not found / dynamic download
            self.nlp = spacy.load("en_core_web_md")
            self.nlp.add_pipe("spacytextblob")

    def set_keys(self) -> dict:
        """Return the result keys this detector always provides, set to None."""
        # "text_clean" matches the key that ``clean_text`` actually writes.
        return {
            "text": None,
            "text_language": None,
            "text_english": None,
            "text_clean": None,
        }

    def analyse_image(self):
        """Run the full pipeline for the image and return the result dict.

        When no text is detected on the image, the translation and analysis
        steps are skipped and the text-related entries stay ``None``.
        """
        self.get_text_from_image()
        if not self.subdict["text"]:
            # Nothing to translate or analyse.
            return self.subdict
        self.translate_text()
        if self.analyse_text:
            self._init_spacy()
            self.clean_text()
            self.correct_spelling()
            self.sentiment_analysis()
        if self._analyse_topic:
            self.analyse_topic()
        return self.subdict

    def get_text_from_image(self):
        """Detect text on the image and store it in ``subdict["text"]``.

        Stores ``None`` when the response contains no text annotations.

        Raises:
            ValueError: If the Vision API response carries an error message.
        """
        path = self.subdict["filename"]
        client = vision.ImageAnnotatorClient()
        with io.open(path, "rb") as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.text_detection(image=image)
        # Check for an API error before touching the annotations.
        if response.error.message:
            raise ValueError(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(
                    response.error.message
                )
            )
        annotations = response.text_annotations
        # Update the existing dict instead of replacing it, so that
        # "filename" and all other entries are preserved.
        self.subdict["text"] = annotations[0].description if annotations else None

    def translate_text(self):
        """Translate the detected text and record the detected source language.

        Does nothing when no text is available.
        """
        text = self.subdict["text"]
        if not text:
            return
        translated = self.translator.translate(text)
        self.subdict["text_language"] = translated.src
        self.subdict["text_english"] = translated.text

    def _init_spacy(self):
        """Generate spacy doc object."""
        self.doc = self.nlp(self.subdict["text_english"])

    def clean_text(self):
        """Clean the text from unrecognized words and any numbers."""
        # Keep tokens that have a word vector and are not tagged as numbers.
        kept = [
            token.text
            for token in self.doc
            if token.pos_ != "NUM" and token.has_vector
        ]
        self.subdict["text_clean"] = " ".join(kept).strip()

    def correct_spelling(self):
        """Store a TextBlob spelling-corrected version of the translated text."""
        self.textblob = TextBlob(self.subdict["text_english"])
        self.subdict["text_english_correct"] = str(self.textblob.correct())

    def sentiment_analysis(self):
        """Store polarity and subjectivity of the translated text."""
        # self.subdict["sentiment"] = self.doc._.blob.sentiment_assessments.assessments
        # polarity is between [-1.0, 1.0]
        self.subdict["polarity"] = self.doc._.blob.polarity
        # subjectivity is a float within the range [0.0, 1.0]
        # where 0.0 is very objective and 1.0 is very subjective
        self.subdict["subjectivity"] = self.doc._.blob.subjectivity

    def analyse_topic(self):
        """Placeholder for topic analysis; not implemented yet."""
        pass