Inga Ulusoy 54728e02bb
Text cleanup and sentiment analysis (#49)
* update notebook

* comments

* add jupyterlab

* add text analysis capability

* add bool in tests

* add dependencies and spelling test

* add test sentiment

* update black pre-commit dependency for native nb support

* update black version, find better sentiment test

* test analyse_image
2023-01-11 12:58:02 +01:00

100 строки
3.5 KiB
Python

from google.cloud import vision
from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob
import io
from misinformation import utils
# TODO: make the widgets work again
# TODO: the cleaned text has odd spacing and splits contractions (e.g. "do n't")
# TODO: increase test coverage for the text analysis
class TextDetector(utils.AnalysisMethod):
    """Detect text on an image, translate it and optionally analyse it.

    The results are collected in ``self.subdict``. The entry ``"filename"`` of
    that dict is read as the path of the image to process.
    """

    def __init__(
        self, subdict: dict, analyse_text: bool = False, analyse_topic: bool = False
    ) -> None:
        """Set up the translator and, if requested, the spaCy pipeline.

        Args:
            subdict: Dict describing one image; must contain ``"filename"``
                before ``get_text_from_image`` is called. It is handed to the
                base class, which presumably stores it as ``self.subdict``.
            analyse_text: If True, load the spaCy model so that the text can
                be cleaned, spell-corrected and sentiment-scored.
            analyse_topic: If True, ``analyse_image`` also runs the topic
                analysis step.
        """
        super().__init__(subdict)
        self.subdict.update(self.set_keys())
        self.translator = Translator()
        self.analyse_text = analyse_text
        # NOTE: this instance attribute has the same name as the
        # ``analyse_topic`` method and therefore shadows it on instances.
        # It is kept for backward compatibility; ``analyse_image`` reaches the
        # method through the class instead.
        self.analyse_topic = analyse_topic
        if self.analyse_text:
            # TODO: spacy load should be a separate method with an error if
            # the model is not found / dynamic download
            self.nlp = spacy.load("en_core_web_md")
            self.nlp.add_pipe("spacytextblob")

    def set_keys(self) -> dict:
        """Return the result keys of this detector, all initialised to None."""
        return {
            "text": None,
            "text_language": None,
            "text_english": None,
            # same key that clean_text() writes to
            "text_clean": None,
        }

    def analyse_image(self):
        """Run the full pipeline on the image and return the result dict.

        Text detection and translation always run. Cleaning, spelling
        correction and sentiment analysis run only if ``analyse_text`` was
        set and translated text is available.

        Returns:
            The updated ``self.subdict``.
        """
        self.get_text_from_image()
        self.translate_text()
        if self.analyse_text and self.subdict.get("text_english") is not None:
            self._init_spacy()
            self.clean_text()
            self.correct_spelling()
            self.sentiment_analysis()
        if self.analyse_topic:
            # ``self.analyse_topic`` is the bool flag set in __init__, so the
            # method of the same name has to be looked up on the class.
            type(self).analyse_topic(self)
        return self.subdict

    def get_text_from_image(self):
        """Detect text on the image and store it under ``"text"``.

        ``"text"`` is set to None if no text annotation is returned.

        Raises:
            ValueError: If the response carries an error message.
        """
        path = self.subdict["filename"]
        client = vision.ImageAnnotatorClient()
        with open(path, "rb") as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.text_detection(image=image)
        # Check for an API error before touching the annotations; a failed
        # request has no annotations to index into.
        if response.error.message:
            raise ValueError(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(
                    response.error.message
                )
            )
        annotations = response.text_annotations
        # Update the existing dict so "filename" and the other keys survive.
        self.subdict["text"] = annotations[0].description if annotations else None

    def translate_text(self):
        """Translate the detected text and record its source language.

        Writes ``"text_language"`` and ``"text_english"``; both are set to
        None when there is no text to translate.
        """
        text = self.subdict["text"]
        if not text:
            self.subdict["text_language"] = None
            self.subdict["text_english"] = None
            return
        translated = self.translator.translate(text)
        self.subdict["text_language"] = translated.src
        self.subdict["text_english"] = translated.text

    def _init_spacy(self):
        """Generate the spaCy doc object from the English text.

        Requires ``self.nlp``, which is only created when ``analyse_text``
        was True at construction.
        """
        self.doc = self.nlp(self.subdict["text_english"])

    def clean_text(self):
        """Clean the text from unrecognized words and any numbers.

        Keeps the tokens that are not tagged ``NUM`` and that have a word
        vector, joined by single spaces, and stores them under
        ``"text_clean"``.
        """
        kept = [
            token.text
            for token in self.doc
            if token.pos_ != "NUM" and token.has_vector
        ]
        self.subdict["text_clean"] = " ".join(kept).strip()

    def correct_spelling(self):
        """Store a spell-corrected copy of the English text.

        The result of TextBlob's ``correct()`` is written to
        ``"text_english_correct"``.
        """
        self.textblob = TextBlob(self.subdict["text_english"])
        self.subdict["text_english_correct"] = str(self.textblob.correct())

    def sentiment_analysis(self):
        """Store polarity and subjectivity of the spaCy doc in the result dict."""
        blob = self.doc._.blob
        # polarity is between [-1.0, 1.0]
        self.subdict["polarity"] = blob.polarity
        # subjectivity is a float within the range [0.0, 1.0]
        # where 0.0 is very objective and 1.0 is very subjective
        self.subdict["subjectivity"] = blob.subjectivity

    def analyse_topic(self):
        """Placeholder for the topic analysis; not implemented yet."""
        pass