put clean text back in, correct default image data path (#145)

2025-10-30 21:46:04 +02:00 · 2023-08-31 13:54:51 +02:00 · 2023-08-31 13:54:51 +02:00 · 8eb4fca75e
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@ -1,5 +1,6 @@
 import pytest
 import ammico.text as tt
+import spacy


@pytest.fixture
@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
        assert not test_obj.analyse_text


+def test_run_spacy(set_testdict, get_path):
+    """Check that _run_spacy stores a spaCy Doc built from the English text."""
+    test_obj = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
+    ref_file = get_path + "text_IMG_3755.txt"
+    # Read with an explicit encoding so the result does not depend on the
+    # platform default (assumes the reference file is stored as UTF-8).
+    with open(ref_file, "r", encoding="utf-8") as file:
+        reference_text = file.read()
+    # Inject the reference text directly so only _run_spacy is exercised.
+    test_obj.subdict["text_english"] = reference_text
+    test_obj._run_spacy()
+    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)
+
+
+def test_clean_text(set_testdict):
+    """Check that clean_text writes the filtered text to ``text_clean``."""
+    pipeline = spacy.load("en_core_web_md")
+    # "fjejg" is a made-up word; the expected result below omits it.
+    parsed = pipeline("I like cats and fjejg")
+    detector = tt.TextDetector(set_testdict["IMG_3755"])
+    # Set the doc by hand so that _run_spacy is not needed for this test.
+    detector.doc = parsed
+    detector.clean_text()
+    assert detector.subdict["text_clean"] == "I like cats and"
+
+
 def test_init_revision_numbers_and_models():
    test_obj = tt.TextDetector({})
    # check the default options
--- a/ammico/text.py
+++ b/ammico/text.py
@ -45,6 +45,8 @@ class TextDetector(AnalysisMethod):
        if not isinstance(analyse_text, bool):
            raise ValueError("analyse_text needs to be set to true or false")
        self.analyse_text = analyse_text
+        if self.analyse_text:
+            self._initialize_spacy()
        if model_names:
            self._check_valid_models(model_names)
        if revision_numbers:
@ -139,6 +141,14 @@ class TextDetector(AnalysisMethod):
        params = {"text": None, "text_language": None, "text_english": None}
        return params

+    def _initialize_spacy(self):
+        """Load the spaCy pipeline used for text analysis into ``self.nlp``.
+
+        If loading the model raises, the model is downloaded via
+        ``spacy.cli.download`` and loading is attempted once more.
+        """
+        model_name = "en_core_web_md"
+        try:
+            self.nlp = spacy.load(model_name)
+        except Exception:
+            # Loading failed (presumably the model is not installed yet):
+            # fetch it and retry; a second failure propagates to the caller.
+            spacy.cli.download(model_name)
+            self.nlp = spacy.load(model_name)
+
    def analyse_image(self) -> dict:
        """Perform text extraction and analysis of the text.

@ -149,6 +159,8 @@ class TextDetector(AnalysisMethod):
        self.translate_text()
        self.remove_linebreaks()
        if self.analyse_text:
+            self._run_spacy()
+            self.clean_text()
            self.text_summary()
            self.text_sentiment_transformers()
            self.text_ner()
@ -200,6 +212,19 @@ class TextDetector(AnalysisMethod):
                "\n", " "
            )

+    def _run_spacy(self):
+        """Run ``self.nlp`` on the English text and keep the result.
+
+        Reads ``self.subdict["text_english"]`` and stores the object returned
+        by ``self.nlp`` in ``self.doc`` for the later analysis steps.
+        """
+        english_text = self.subdict["text_english"]
+        self.doc = self.nlp(english_text)
+
+    def clean_text(self):
+        """Clean the text from unrecognized words and any numbers.
+
+        Iterates over the tokens in ``self.doc`` and keeps the text of every
+        token that is not tagged as a numeral (``pos_ != "NUM"``) and that
+        has a word vector. The kept texts are joined with single spaces,
+        stripped of leading/trailing whitespace and stored in
+        ``self.subdict["text_clean"]``.
+        """
+        kept = [
+            token.text
+            for token in self.doc
+            if token.pos_ != "NUM" and token.has_vector
+        ]
+        self.subdict["text_clean"] = " ".join(kept).strip()
+
    def text_summary(self):
        """Generate a summary of the text using the Transformers pipeline."""
        # use the transformers pipeline to summarize the text
--- a/ammico/utils.py
+++ b/ammico/utils.py
@ -105,7 +105,7 @@ def find_files(

    Args:
        path (str, optional): The base directory where we are looking for the images. Defaults
-            to None, which uses the XDG data directory if set or the current
+            to None, which uses the ammico data directory if set or the current
            working directory otherwise.
        pattern (str|list, optional): The naming pattern that the filename should match.
                Use either '.ext' or just 'ext'
@ -122,7 +122,7 @@ def find_files(
    """

    if path is None:
-        path = os.environ.get("XDG_DATA_HOME", ".")
+        path = os.environ.get("AMMICO_DATA_HOME", ".")

    if isinstance(pattern, str):
        pattern = [pattern]