From 8eb4fca75eca3f9e07fb55b8b87f99b95bf34392 Mon Sep 17 00:00:00 2001
From: Inga Ulusoy
Date: Thu, 31 Aug 2023 13:54:51 +0200
Subject: [PATCH] put clean text back in, correct default image data path (#145)

---
 ammico/test/test_text.py | 21 +++++++++++++++++++++
 ammico/text.py           | 25 +++++++++++++++++++++++++
 ammico/utils.py          |  4 ++--
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py
index dd0f9f1..53a51e8 100644
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@@ -1,5 +1,6 @@
 import pytest
 import ammico.text as tt
+import spacy
 
 
 @pytest.fixture
@@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
     assert not test_obj.analyse_text
 
 
+def test_run_spacy(set_testdict, get_path):
+    test_obj = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
+    ref_file = get_path + "text_IMG_3755.txt"
+    with open(ref_file, "r") as file:
+        reference_text = file.read()
+    test_obj.subdict["text_english"] = reference_text
+    test_obj._run_spacy()
+    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)
+
+
+def test_clean_text(set_testdict):
+    nlp = spacy.load("en_core_web_md")
+    doc = nlp("I like cats and fjejg")
+    test_obj = tt.TextDetector(set_testdict["IMG_3755"])
+    test_obj.doc = doc
+    test_obj.clean_text()
+    result = "I like cats and"
+    assert test_obj.subdict["text_clean"] == result
+
+
 def test_init_revision_numbers_and_models():
     test_obj = tt.TextDetector({})
     # check the default options
diff --git a/ammico/text.py b/ammico/text.py
index 1964998..d4279d0 100644
--- a/ammico/text.py
+++ b/ammico/text.py
@@ -45,6 +45,8 @@ class TextDetector(AnalysisMethod):
         if not isinstance(analyse_text, bool):
             raise ValueError("analyse_text needs to be set to true or false")
         self.analyse_text = analyse_text
+        if self.analyse_text:
+            self._initialize_spacy()
         if model_names:
             self._check_valid_models(model_names)
         if revision_numbers:
@@ -139,6 +141,14 @@ class TextDetector(AnalysisMethod):
         params = {"text": None, "text_language": None, "text_english": None}
         return params
 
+    def _initialize_spacy(self):
+        """Initialize the Spacy library for text analysis."""
+        try:
+            self.nlp = spacy.load("en_core_web_md")
+        except Exception:
+            spacy.cli.download("en_core_web_md")
+            self.nlp = spacy.load("en_core_web_md")
+
     def analyse_image(self) -> dict:
         """Perform text extraction and analysis of the text.
 
@@ -149,6 +159,8 @@ class TextDetector(AnalysisMethod):
         self.translate_text()
         self.remove_linebreaks()
         if self.analyse_text:
+            self._run_spacy()
+            self.clean_text()
             self.text_summary()
             self.text_sentiment_transformers()
             self.text_ner()
@@ -200,6 +212,19 @@ class TextDetector(AnalysisMethod):
                 "\n", " "
             )
 
+    def _run_spacy(self):
+        """Generate Spacy doc object for further text analysis."""
+        self.doc = self.nlp(self.subdict["text_english"])
+
+    def clean_text(self):
+        """Clean the text from unrecognized words and any numbers."""
+        templist = []
+        for token in self.doc:
+            templist.append(
+                token.text
+            ) if token.pos_ != "NUM" and token.has_vector else None
+        self.subdict["text_clean"] = " ".join(templist).rstrip().lstrip()
+
     def text_summary(self):
         """Generate a summary of the text using the Transformers pipeline."""
         # use the transformers pipeline to summarize the text
diff --git a/ammico/utils.py b/ammico/utils.py
index 0a9a380..ffe66b6 100644
--- a/ammico/utils.py
+++ b/ammico/utils.py
@@ -105,7 +105,7 @@ def find_files(
 
     Args:
         path (str, optional): The base directory where we are looking for the images. Defaults
-            to None, which uses the XDG data directory if set or the current
+            to None, which uses the ammico data directory if set or the current
             working directory otherwise.
         pattern (str|list, optional): The naming pattern that the filename should match.
                 Use either '.ext' or just 'ext'
@@ -122,7 +122,7 @@ def find_files(
     """
 
     if path is None:
-        path = os.environ.get("XDG_DATA_HOME", ".")
+        path = os.environ.get("AMMICO_DATA_HOME", ".")
 
     if isinstance(pattern, str):
         pattern = [pattern]