зеркало из
https://github.com/ssciwr/AMMICO.git
synced 2025-10-29 13:06:04 +02:00
manage occurrence of full stops in a better way (#229)
* manage occurrence of full stops in a better way * bump version * cleanup
Этот коммит содержится в:
родитель
403525aa46
Коммит
e12929a909
@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
|
||||
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
|
||||
|
||||
|
||||
def test_check_add_space_after_full_stop(accepted):
    """Check that a space is inserted after each full stop that lacks one."""
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # Each pair is (text before the check, text expected after the check).
    cases = (
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        ("I like cats.", "I like cats."),
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    for raw_text, expected in cases:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected
|
||||
|
||||
|
||||
@pytest.mark.gcv
|
||||
def test_analyse_image(set_testdict, set_environ, accepted):
|
||||
for item in set_testdict:
|
||||
|
||||
@ -4,6 +4,7 @@ from googletrans import Translator
|
||||
import spacy
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
from ammico.utils import AnalysisMethod
|
||||
import grpc
|
||||
import pandas as pd
|
||||
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
|
||||
spacy.cli.download("en_core_web_md")
|
||||
self.nlp = spacy.load("en_core_web_md")
|
||||
|
||||
def _check_add_space_after_full_stop(self):
    """Add a space after every full stop that is not followed by one.

    Required by googletrans. The text is read from and written back to
    ``self.subdict["text"]``. A full stop that is the last character of the
    text, or that is already followed by a space, is left untouched.
    """
    # The lookahead demands a following character that is not a space, so a
    # trailing full stop never matches. Since the lookahead does not consume
    # that character, consecutive full stops ("..") are each handled in turn.
    self.subdict["text"] = re.sub(r"\.(?=[^ ])", ". ", self.subdict["text"])
|
||||
|
||||
def analyse_image(self) -> dict:
|
||||
"""Perform text extraction and analysis of the text.
|
||||
|
||||
@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
|
||||
else:
|
||||
# make sure all full stops are followed by whitespace
|
||||
# otherwise googletrans breaks
|
||||
index_stop = self.subdict["text"].find(".")
|
||||
if self.subdict["text"][index_stop + 1] != " ":
|
||||
self.subdict["text"] = (
|
||||
self.subdict["text"][: index_stop + 1]
|
||||
+ " "
|
||||
+ self.subdict["text"][index_stop + 1 :]
|
||||
)
|
||||
self._check_add_space_after_full_stop()
|
||||
self.translate_text()
|
||||
self.remove_linebreaks()
|
||||
if self.analyse_text:
|
||||
|
||||
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "ammico"
|
||||
version = "0.2.3"
|
||||
version = "0.2.4"
|
||||
description = "AI Media and Misinformation Content Analysis Tool"
|
||||
readme = "README.md"
|
||||
maintainers = [
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user