manage occurrence of full stops in a better way (#229)

* manage occurrence of full stops in a better way

* bump version

* cleanup
Этот коммит содержится в:
Inga Ulusoy 2024-12-02 15:01:50 +01:00 коммит произвёл GitHub
родитель 403525aa46
Коммит e12929a909
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 49 добавлений и 8 удалений

Просмотреть файл

@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
def test_check_add_space_after_full_stop(accepted):
    """Check that a space is inserted after full stops that lack one."""
    # Each pair is (input text, text expected after the check). Full stops
    # already followed by a space, and a full stop that ends the text, must
    # be left untouched.
    cases = (
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        ("I like cats.", "I like cats."),
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    detector = tt.TextDetector({}, accept_privacy=accepted)
    for raw_text, expected_text in cases:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected_text
@pytest.mark.gcv
def test_analyse_image(set_testdict, set_environ, accepted):
for item in set_testdict:

Просмотреть файл

@ -4,6 +4,7 @@ from googletrans import Translator
import spacy
import io
import os
import re
from ammico.utils import AnalysisMethod
import grpc
import pandas as pd
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
spacy.cli.download("en_core_web_md")
self.nlp = spacy.load("en_core_web_md")
def _check_add_space_after_full_stop(self):
    """Add a space after each full stop that is directly followed by text.

    Required by googletrans. The text stored in ``self.subdict["text"]`` is
    rewritten in place. A full stop that is the last character of the text,
    or that is already followed by a space, is left unchanged.
    """
    # The lookahead demands one more character after the full stop, so a
    # trailing full stop never matches; "[^ ]" skips full stops that already
    # have their space. Using a raw string avoids the invalid "\." escape
    # sequence, and a single substitution pass replaces the manual index
    # bookkeeping and repeated string slicing.
    self.subdict["text"] = re.sub(r"\.(?=[^ ])", ". ", self.subdict["text"])
def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text.
@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
else:
# make sure all full stops are followed by whitespace
# otherwise googletrans breaks
index_stop = self.subdict["text"].find(".")
if self.subdict["text"][index_stop + 1] != " ":
self.subdict["text"] = (
self.subdict["text"][: index_stop + 1]
+ " "
+ self.subdict["text"][index_stop + 1 :]
)
self._check_add_space_after_full_stop()
self.translate_text()
self.remove_linebreaks()
if self.analyse_text:

Просмотреть файл

@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "ammico"
version = "0.2.3"
version = "0.2.4"
description = "AI Media and Misinformation Content Analysis Tool"
readme = "README.md"
maintainers = [