diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py index 5ebb00d..129f1f1 100644 --- a/ammico/test/test_text.py +++ b/ammico/test/test_text.py @@ -154,6 +154,16 @@ def test_check_add_space_after_full_stop(accepted): assert test_obj.subdict["text"] == "www. icanhascheezburger. com" +def test_truncate_text(accepted): + test_obj = tt.TextDetector({}, accept_privacy=accepted) + test_obj.subdict["text"] = "I like cats and dogs." + test_obj._truncate_text() + assert test_obj.subdict["text"] == "I like cats and dogs." + test_obj.subdict["text"] = 20000 * "m" + test_obj._truncate_text() + assert test_obj.subdict["text"] == 5000 * "m" + + @pytest.mark.gcv def test_analyse_image(set_testdict, set_environ, accepted): for item in set_testdict: diff --git a/ammico/text.py b/ammico/text.py index 0d020af..ca2516b 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -112,7 +112,7 @@ class TextDetector(AnalysisMethod): raise ValueError( "Privacy disclosure not accepted - skipping text detection." ) - self.translator = Translator() + self.translator = Translator(raise_exception=True) if not isinstance(analyse_text, bool): raise ValueError("analyse_text needs to be set to true or false") self.analyse_text = analyse_text @@ -259,6 +259,12 @@ class TextDetector(AnalysisMethod): ) add_one += 1 + def _truncate_text(self, max_length: int = 5000) -> None: + """Truncate the text if it is too long for googletrans.""" + if self.subdict["text"] and len(self.subdict["text"]) > max_length: + print("Text is too long - truncating to {} characters.".format(max_length)) + self.subdict["text"] = self.subdict["text"][:max_length] + def analyse_image(self) -> dict: """Perform text extraction and analysis of the text. 
@@ -274,6 +280,7 @@ class TextDetector(AnalysisMethod): # make sure all full stops are followed by whitespace # otherwise googletrans breaks self._check_add_space_after_full_stop() + self._truncate_text() self.translate_text() self.remove_linebreaks() if self.analyse_text: @@ -329,13 +336,18 @@ class TextDetector(AnalysisMethod): raise ValueError( "Privacy disclosure not accepted - skipping text translation." ) - translated = self.translator.translate(self.subdict["text"]) - self.subdict["text_language"] = translated.src - self.subdict["text_english"] = translated.text + try: + translated = self.translator.translate(self.subdict["text"]) + except Exception as error: + print("Could not translate the text with error {}.".format(error)) + translated = None + print("Skipping translation for this text.") + self.subdict["text_language"] = translated.src if translated else None + self.subdict["text_english"] = translated.text if translated else None def remove_linebreaks(self): """Remove linebreaks from original and translated text.""" - if self.subdict["text"]: + if self.subdict["text"] and self.subdict["text_english"]: self.subdict["text"] = self.subdict["text"].replace("\n", " ") self.subdict["text_english"] = self.subdict["text_english"].replace( "\n", " " diff --git a/pyproject.toml b/pyproject.toml index cbd55be..c2b3bd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "matplotlib", "numpy<=1.23.4", "pandas", + "peft<=0.13.0", "Pillow", "pooch", "protobuf",