зеркало из
https://github.com/ssciwr/AMMICO.git
synced 2025-10-30 05:26:05 +02:00
manage occurrence of full stops in a better way (#229)
* manage occurrence of full stops in a better way * bump version * cleanup
Этот коммит содержится в:
родитель
403525aa46
Коммит
e12929a909
@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
|
|||||||
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
|
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_add_space_after_full_stop(accepted):
    """Check the space insertion after full stops on three sample texts."""
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # pairs of (text before the call, text expected after the call)
    samples = (
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        ("I like cats.", "I like cats."),
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    for raw_text, expected_text in samples:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected_text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.gcv
|
@pytest.mark.gcv
|
||||||
def test_analyse_image(set_testdict, set_environ, accepted):
|
def test_analyse_image(set_testdict, set_environ, accepted):
|
||||||
for item in set_testdict:
|
for item in set_testdict:
|
||||||
|
|||||||
@ -4,6 +4,7 @@ from googletrans import Translator
|
|||||||
import spacy
|
import spacy
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from ammico.utils import AnalysisMethod
|
from ammico.utils import AnalysisMethod
|
||||||
import grpc
|
import grpc
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -225,6 +226,39 @@ class TextDetector(AnalysisMethod):
|
|||||||
spacy.cli.download("en_core_web_md")
|
spacy.cli.download("en_core_web_md")
|
||||||
self.nlp = spacy.load("en_core_web_md")
|
self.nlp = spacy.load("en_core_web_md")
|
||||||
|
|
||||||
|
def _check_add_space_after_full_stop(self):
|
||||||
|
"""Add a space after a full stop. Required by googletrans."""
|
||||||
|
# we have found text, now we check for full stops
|
||||||
|
index_stop = [
|
||||||
|
i.start() for i in re.finditer("\.", self.subdict["text"]) # noqa
|
||||||
|
]
|
||||||
|
if not index_stop: # no full stops found
|
||||||
|
return
|
||||||
|
# check if this includes the last string item
|
||||||
|
end_of_list = False
|
||||||
|
if len(self.subdict["text"]) <= (index_stop[-1] + 1):
|
||||||
|
# the last found full stop is at the end of the string
|
||||||
|
# but we can include all others
|
||||||
|
if len(index_stop) == 1:
|
||||||
|
end_of_list = True
|
||||||
|
else:
|
||||||
|
index_stop.pop()
|
||||||
|
if end_of_list: # only one full stop at end of string
|
||||||
|
return
|
||||||
|
# if this is not the end of the list, check if there is a space after the full stop
|
||||||
|
no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "]
|
||||||
|
if not no_space: # all full stops have a space after them
|
||||||
|
return
|
||||||
|
# else, amend the text
|
||||||
|
add_one = 1
|
||||||
|
for i in no_space:
|
||||||
|
self.subdict["text"] = (
|
||||||
|
self.subdict["text"][: i + add_one]
|
||||||
|
+ " "
|
||||||
|
+ self.subdict["text"][i + add_one :]
|
||||||
|
)
|
||||||
|
add_one += 1
|
||||||
|
|
||||||
def analyse_image(self) -> dict:
|
def analyse_image(self) -> dict:
|
||||||
"""Perform text extraction and analysis of the text.
|
"""Perform text extraction and analysis of the text.
|
||||||
|
|
||||||
@ -239,13 +273,7 @@ class TextDetector(AnalysisMethod):
|
|||||||
else:
|
else:
|
||||||
# make sure all full stops are followed by whitespace
|
# make sure all full stops are followed by whitespace
|
||||||
# otherwise googletrans breaks
|
# otherwise googletrans breaks
|
||||||
index_stop = self.subdict["text"].find(".")
|
self._check_add_space_after_full_stop()
|
||||||
if self.subdict["text"][index_stop + 1] != " ":
|
|
||||||
self.subdict["text"] = (
|
|
||||||
self.subdict["text"][: index_stop + 1]
|
|
||||||
+ " "
|
|
||||||
+ self.subdict["text"][index_stop + 1 :]
|
|
||||||
)
|
|
||||||
self.translate_text()
|
self.translate_text()
|
||||||
self.remove_linebreaks()
|
self.remove_linebreaks()
|
||||||
if self.analyse_text:
|
if self.analyse_text:
|
||||||
|
|||||||
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "ammico"
|
name = "ammico"
|
||||||
version = "0.2.3"
|
version = "0.2.4"
|
||||||
description = "AI Media and Misinformation Content Analysis Tool"
|
description = "AI Media and Misinformation Content Analysis Tool"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
maintainers = [
|
maintainers = [
|
||||||
|
|||||||
Загрузка…
x
Ссылка в новой задаче
Block a user