improve handling of exceptions for googletrans (#244)

* truncate text to max length for googletrans, skip translation in case of error
* restrict peft version for backwards compatibility with old transformers required by lavis
2025-10-29 13:06:04 +02:00 · 2025-02-05 13:52:25 +01:00 · 2025-02-05 13:52:25 +01:00 · 3cf1b5466a
--- a/ammico/test/test_text.py
+++ b/ammico/test/test_text.py
@ -154,6 +154,16 @@ def test_check_add_space_after_full_stop(accepted):
    assert test_obj.subdict["text"] == "www. icanhascheezburger. com"


+def test_truncate_text(accepted):
+    """Check that _truncate_text keeps short text and caps long text at 5000 characters."""
+    detector = tt.TextDetector({}, accept_privacy=accepted)
+    # text below the limit must pass through unchanged
+    short_text = "I like cats and dogs."
+    detector.subdict["text"] = short_text
+    detector._truncate_text()
+    assert detector.subdict["text"] == short_text
+    # text above the limit is cut down to the default maximum length
+    detector.subdict["text"] = "m" * 20000
+    detector._truncate_text()
+    assert detector.subdict["text"] == "m" * 5000
+
+
@pytest.mark.gcv
 def test_analyse_image(set_testdict, set_environ, accepted):
    for item in set_testdict:
--- a/ammico/text.py
+++ b/ammico/text.py
@ -112,7 +112,7 @@ class TextDetector(AnalysisMethod):
            raise ValueError(
                "Privacy disclosure not accepted - skipping text detection."
            )
-        self.translator = Translator()
+        self.translator = Translator(raise_exception=True)
        if not isinstance(analyse_text, bool):
            raise ValueError("analyse_text needs to be set to true or false")
        self.analyse_text = analyse_text
@ -259,6 +259,12 @@ class TextDetector(AnalysisMethod):
            )
            add_one += 1

+    def _truncate_text(self, max_length: int = 5000) -> None:
+        """Truncate the text in place if it is too long for googletrans.
+
+        The text stored in `self.subdict["text"]` is shortened to its first
+        `max_length` characters; nothing is returned.
+
+        Args:
+            max_length (int, optional): Maximum number of characters to keep.
+                Defaults to 5000.
+        """
+        text = self.subdict["text"]
+        # missing/empty text and text within the limit are left untouched
+        if text and len(text) > max_length:
+            print("Text is too long - truncating to {} characters.".format(max_length))
+            self.subdict["text"] = text[:max_length]
+
    def analyse_image(self) -> dict:
        """Perform text extraction and analysis of the text.

@ -274,6 +280,7 @@ class TextDetector(AnalysisMethod):
            # make sure all full stops are followed by whitespace
            # otherwise googletrans breaks
            self._check_add_space_after_full_stop()
+            self._truncate_text()
            self.translate_text()
            self.remove_linebreaks()
            if self.analyse_text:
@ -329,13 +336,18 @@ class TextDetector(AnalysisMethod):
            raise ValueError(
                "Privacy disclosure not accepted - skipping text translation."
            )
+        try:
            translated = self.translator.translate(self.subdict["text"])
-        self.subdict["text_language"] = translated.src
-        self.subdict["text_english"] = translated.text
+        except Exception as err:
+            # best effort: report the actual error instance (not the Exception
+            # class itself) and carry on without a translation
+            print("Could not translate the text with error {}.".format(err))
+            translated = None
+            print("Skipping translation for this text.")
+        # language and English text stay None when the translation failed
+        self.subdict["text_language"] = translated.src if translated else None
+        self.subdict["text_english"] = translated.text if translated else None

    def remove_linebreaks(self):
        """Remove linebreaks from original and translated text."""
-        if self.subdict["text"]:
+        if self.subdict["text"] and self.subdict["text_english"]:
            self.subdict["text"] = self.subdict["text"].replace("\n", " ")
            self.subdict["text_english"] = self.subdict["text_english"].replace(
                "\n", " "
--- a/pyproject.toml
+++ b/pyproject.toml
@ -35,6 +35,7 @@ dependencies = [
    "matplotlib",
    "numpy<=1.23.4",
    "pandas",
+    "peft<=0.13.0",
    "Pillow",
    "pooch",
    "protobuf",