manage occurrence of full stops in a better way (#229)

* manage occurrence of full stops in a better way * bump version * cleanup
2025-10-30 05:26:05 +02:00 · 2024-12-02 15:01:50 +01:00 · 2024-12-02 15:01:50 +01:00 · e12929a909
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
        tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
 def test_check_add_space_after_full_stop(accepted):
    """Check the space-after-full-stop fix-up on representative texts."""
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # (text before the fix-up, text expected afterwards); all cases run in
    # order against the same detector instance.
    cases = [
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        ("I like cats.", "I like cats."),
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    ]
    for raw_text, expected in cases:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected
@pytest.mark.gcv
 def test_analyse_image(set_testdict, set_environ, accepted):
    for item in set_testdict:
--- a/ammico/text.py
+++ b/ammico/text.py
@ -4,6 +4,7 @@ from googletrans import Translator
 import spacy
 import io
 import os
 import re
 from ammico.utils import AnalysisMethod
 import grpc
 import pandas as pd
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
            spacy.cli.download("en_core_web_md")
            self.nlp = spacy.load("en_core_web_md")
    def _check_add_space_after_full_stop(self):
        """Add a space after a full stop. Required by googletrans."""
        # we have found text, now we check for full stops
        index_stop = [
            i.start() for i in re.finditer("\.", self.subdict["text"])  # noqa
        ]
        if not index_stop:  # no full stops found
            return
        # check if this includes the last string item
        end_of_list = False
        if len(self.subdict["text"]) <= (index_stop[-1] + 1):
            # the last found full stop is at the end of the string
            # but we can include all others
            if len(index_stop) == 1:
                end_of_list = True
            else:
                index_stop.pop()
        if end_of_list:  # only one full stop at end of string
            return
        # if this is not the end of the list, check if there is a space after the full stop
        no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "]
        if not no_space:  # all full stops have a space after them
            return
        # else, amend the text
        add_one = 1
        for i in no_space:
            self.subdict["text"] = (
                self.subdict["text"][: i + add_one]
                + " "
                + self.subdict["text"][i + add_one :]
            )
            add_one += 1
    def analyse_image(self) -> dict:
        """Perform text extraction and analysis of the text.
@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
        else:
            # make sure all full stops are followed by whitespace
            # otherwise googletrans breaks
-            index_stop = self.subdict["text"].find(".")
+            self._check_add_space_after_full_stop()
-            if self.subdict["text"][index_stop + 1] != " ":
-                self.subdict["text"] = (
-                    self.subdict["text"][: index_stop + 1]
-                    + " "
-                    + self.subdict["text"][index_stop + 1 :]
-                )
            self.translate_text()
            self.remove_linebreaks()
            if self.analyse_text:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "ammico"
-version = "0.2.3"
+version = "0.2.4"
 description = "AI Media and Misinformation Content Analysis Tool"
 readme = "README.md"
 maintainers = [