manage occurrence of full stops in a better way (#229)

* manage occurrence of full stops in a better way

* bump version

* cleanup
Этот коммит содержится в:
Inga Ulusoy 2024-12-02 15:01:50 +01:00 коммит произвёл GitHub
родитель 403525aa46
Коммит e12929a909
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 49 добавлений и 8 удалений

Просмотреть файл

@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted) tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
def test_check_add_space_after_full_stop(accepted):
    """Check that a space is inserted after full stops that lack one.

    The same detector instance is reused for every case; only its
    ``subdict["text"]`` entry is replaced between checks.
    """
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # (text before the check, text expected after the check)
    expectations = (
        # every full stop is already followed by a space or ends the text
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        # a single full stop at the very end is left alone
        ("I like cats.", "I like cats."),
        # full stops inside a token get a space appended
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    for raw_text, expected_text in expectations:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected_text
@pytest.mark.gcv @pytest.mark.gcv
def test_analyse_image(set_testdict, set_environ, accepted): def test_analyse_image(set_testdict, set_environ, accepted):
for item in set_testdict: for item in set_testdict:

Просмотреть файл

@ -4,6 +4,7 @@ from googletrans import Translator
import spacy import spacy
import io import io
import os import os
import re
from ammico.utils import AnalysisMethod from ammico.utils import AnalysisMethod
import grpc import grpc
import pandas as pd import pandas as pd
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
spacy.cli.download("en_core_web_md") spacy.cli.download("en_core_web_md")
self.nlp = spacy.load("en_core_web_md") self.nlp = spacy.load("en_core_web_md")
def _check_add_space_after_full_stop(self):
    """Insert a space after every full stop that is not followed by one.

    Operates in place on ``self.subdict["text"]``. Required by googletrans.

    A full stop is left untouched when it is the last character of the
    text or when the next character is already a space. Any other
    following character (a letter, a newline, another full stop, ...)
    causes a single space to be inserted directly after the full stop,
    e.g. ``"www.example.com"`` becomes ``"www. example. com"``.
    """
    # The lookahead requires one more character that is not a space, which
    # excludes both stops that already have a space and a stop at the very
    # end of the text. The pattern is a raw string so that "\." is not an
    # invalid string escape, and the substitution runs in a single pass
    # instead of rebuilding the text once per full stop.
    self.subdict["text"] = re.sub(r"\.(?=[^ ])", ". ", self.subdict["text"])
def analyse_image(self) -> dict: def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text. """Perform text extraction and analysis of the text.
@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
else: else:
# make sure all full stops are followed by whitespace # make sure all full stops are followed by whitespace
# otherwise googletrans breaks # otherwise googletrans breaks
index_stop = self.subdict["text"].find(".") self._check_add_space_after_full_stop()
if self.subdict["text"][index_stop + 1] != " ":
self.subdict["text"] = (
self.subdict["text"][: index_stop + 1]
+ " "
+ self.subdict["text"][index_stop + 1 :]
)
self.translate_text() self.translate_text()
self.remove_linebreaks() self.remove_linebreaks()
if self.analyse_text: if self.analyse_text:

Просмотреть файл

@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "ammico" name = "ammico"
version = "0.2.3" version = "0.2.4"
description = "AI Media and Misinformation Content Analysis Tool" description = "AI Media and Misinformation Content Analysis Tool"
readme = "README.md" readme = "README.md"
maintainers = [ maintainers = [