manage occurence of full stops in a better way (#229)

* manage occurence of full stops in a better way * bump version * cleanup
2025-10-29 21:16:06 +02:00 · 2024-12-02 15:01:50 +01:00 · 2024-12-02 15:01:50 +01:00 · e12929a909
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
        tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)


+def test_check_add_space_after_full_stop(accepted):
+    test_obj = tt.TextDetector({}, accept_privacy=accepted)
+    test_obj.subdict["text"] = "I like cats. I like dogs."
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "I like cats. I like dogs."
+    test_obj.subdict["text"] = "I like cats."
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "I like cats."
+    test_obj.subdict["text"] = "www.icanhascheezburger.com"
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "www. icanhascheezburger. com"
+
+
@pytest.mark.gcv
 def test_analyse_image(set_testdict, set_environ, accepted):
    for item in set_testdict:
--- a/ammico/text.py
+++ b/ammico/text.py
@ -4,6 +4,7 @@ from googletrans import Translator
 import spacy
 import io
 import os
+import re
 from ammico.utils import AnalysisMethod
 import grpc
 import pandas as pd
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
            spacy.cli.download("en_core_web_md")
            self.nlp = spacy.load("en_core_web_md")

+    def _check_add_space_after_full_stop(self):
+        """Add a space after a full stop. Required by googletrans."""
+        # we have found text, now we check for full stops
+        index_stop = [
+            i.start() for i in re.finditer("\.", self.subdict["text"])  # noqa
+        ]
+        if not index_stop:  # no full stops found
+            return
+        # check if this includes the last string item
+        end_of_list = False
+        if len(self.subdict["text"]) <= (index_stop[-1] + 1):
+            # the last found full stop is at the end of the string
+            # but we can include all others
+            if len(index_stop) == 1:
+                end_of_list = True
+            else:
+                index_stop.pop()
+        if end_of_list:  # only one full stop at end of string
+            return
+        # if this is not the end of the list, check if there is a space after the full stop
+        no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "]
+        if not no_space:  # all full stops have a space after them
+            return
+        # else, amend the text
+        add_one = 1
+        for i in no_space:
+            self.subdict["text"] = (
+                self.subdict["text"][: i + add_one]
+                + " "
+                + self.subdict["text"][i + add_one :]
+            )
+            add_one += 1
+
    def analyse_image(self) -> dict:
        """Perform text extraction and analysis of the text.

@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
        else:
            # make sure all full stops are followed by whitespace
            # otherwise googletrans breaks
-            index_stop = self.subdict["text"].find(".")
-            if self.subdict["text"][index_stop + 1] != " ":
-                self.subdict["text"] = (
-                    self.subdict["text"][: index_stop + 1]
-                    + " "
-                    + self.subdict["text"][index_stop + 1 :]
-                )
+            self._check_add_space_after_full_stop()
            self.translate_text()
            self.remove_linebreaks()
            if self.analyse_text:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "ammico"
-version = "0.2.3"
+version = "0.2.4"
 description = "AI Media and Misinformation Content Analysis Tool"
 readme = "README.md"
 maintainers = [