From 8eb4fca75eca3f9e07fb55b8b87f99b95bf34392 Mon Sep 17 00:00:00 2001
From: Inga Ulusoy <inga.ulusoy@uni-heidelberg.de>
Date: Thu, 31 Aug 2023 13:54:51 +0200
Subject: [PATCH] put clean text back in, correct default image data path
 (#145)

---
 ammico/test/test_text.py | 21 +++++++++++++++++++++
 ammico/text.py           | 25 +++++++++++++++++++++++++
 ammico/utils.py          |  4 ++--
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py
index dd0f9f1..53a51e8 100644
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@@ -1,5 +1,6 @@
 import pytest
 import ammico.text as tt
+import spacy
 
 
 @pytest.fixture
@@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
         assert not test_obj.analyse_text
 
 
+def test_run_spacy(set_testdict, get_path):
+    """_run_spacy should turn the English text into a spaCy Doc."""
+    detector = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
+    # use the stored reference text as the English input
+    with open(get_path + "text_IMG_3755.txt", "r") as handle:
+        detector.subdict["text_english"] = handle.read()
+    detector._run_spacy()
+    assert isinstance(detector.doc, spacy.tokens.doc.Doc)
+
+
+def test_clean_text(set_testdict):
+    """clean_text should leave out tokens that have no word vector."""
+    detector = tt.TextDetector(set_testdict["IMG_3755"])
+    # "fjejg" is a made-up word and is expected to be removed
+    detector.doc = spacy.load("en_core_web_md")("I like cats and fjejg")
+    detector.clean_text()
+    expected = "I like cats and"
+    assert detector.subdict["text_clean"] == expected
+
+
 def test_init_revision_numbers_and_models():
     test_obj = tt.TextDetector({})
     # check the default options
diff --git a/ammico/text.py b/ammico/text.py
index 1964998..d4279d0 100644
--- a/ammico/text.py
+++ b/ammico/text.py
@@ -45,6 +45,8 @@ class TextDetector(AnalysisMethod):
         if not isinstance(analyse_text, bool):
             raise ValueError("analyse_text needs to be set to true or false")
         self.analyse_text = analyse_text
+        if self.analyse_text:
+            self._initialize_spacy()
         if model_names:
             self._check_valid_models(model_names)
         if revision_numbers:
@@ -139,6 +141,14 @@ class TextDetector(AnalysisMethod):
         params = {"text": None, "text_language": None, "text_english": None}
         return params
 
+    def _initialize_spacy(self):
+        """Load the en_core_web_md spaCy model, downloading it if not installed."""
+        try:
+            self.nlp = spacy.load("en_core_web_md")
+        except OSError:  # raised by spacy.load when the model is not installed
+            spacy.cli.download("en_core_web_md")
+            self.nlp = spacy.load("en_core_web_md")
+
     def analyse_image(self) -> dict:
         """Perform text extraction and analysis of the text.
 
@@ -149,6 +159,8 @@ class TextDetector(AnalysisMethod):
         self.translate_text()
         self.remove_linebreaks()
         if self.analyse_text:
+            self._run_spacy()
+            self.clean_text()
             self.text_summary()
             self.text_sentiment_transformers()
             self.text_ner()
@@ -200,6 +212,19 @@ class TextDetector(AnalysisMethod):
                 "\n", " "
             )
 
+    def _run_spacy(self):
+        """Run the spaCy pipeline self.nlp on the English text and store self.doc."""
+        self.doc = self.nlp(self.subdict["text_english"])
+
+    def clean_text(self):
+        """Store the text without numbers and unknown words as "text_clean".
+
+        Tokens of self.doc tagged NUM or lacking a word vector are left out.
+        """
+        kept = [t.text for t in self.doc if t.pos_ != "NUM" and t.has_vector]
+        # tokens are joined with single spaces; outer whitespace is trimmed
+        self.subdict["text_clean"] = " ".join(kept).strip()
+
     def text_summary(self):
         """Generate a summary of the text using the Transformers pipeline."""
         # use the transformers pipeline to summarize the text
diff --git a/ammico/utils.py b/ammico/utils.py
index 0a9a380..ffe66b6 100644
--- a/ammico/utils.py
+++ b/ammico/utils.py
@@ -105,7 +105,7 @@ def find_files(
 
     Args:
         path (str, optional): The base directory where we are looking for the images. Defaults
-            to None, which uses the XDG data directory if set or the current
+            to None, which uses $AMMICO_DATA_HOME if that is set or the current
             working directory otherwise.
         pattern (str|list, optional): The naming pattern that the filename should match.
                 Use either '.ext' or just 'ext'
@@ -122,7 +122,7 @@ def find_files(
     """
 
     if path is None:
-        path = os.environ.get("XDG_DATA_HOME", ".")
+        path = os.environ.get("AMMICO_DATA_HOME", ".")
 
     if isinstance(pattern, str):
         pattern = [pattern]