put clean text back in, correct default image data path (#145)

Этот коммит содержится в:
Inga Ulusoy 2023-08-31 13:54:51 +02:00 коммит произвёл GitHub
родитель e120c10d9f
Коммит 8eb4fca75e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 48 добавлений и 2 удалений

Просмотреть файл

@ -1,5 +1,6 @@
import pytest
import ammico.text as tt
import spacy
@pytest.fixture
@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
assert not test_obj.analyse_text
def test_run_spacy(set_testdict, get_path):
    """Check that ``_run_spacy`` stores a spaCy ``Doc`` on the detector."""
    detector = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
    # Feed the stored reference text in as the English text to analyse.
    reference_path = get_path + "text_IMG_3755.txt"
    with open(reference_path, "r") as handle:
        detector.subdict["text_english"] = handle.read()
    detector._run_spacy()
    assert isinstance(detector.doc, spacy.tokens.doc.Doc)
def test_clean_text(set_testdict):
    """Check that ``clean_text`` removes the nonsense token from the text."""
    pipeline = spacy.load("en_core_web_md")
    detector = tt.TextDetector(set_testdict["IMG_3755"])
    # Inject a ready-made doc so only the cleaning step is exercised.
    detector.doc = pipeline("I like cats and fjejg")
    detector.clean_text()
    expected = "I like cats and"
    assert detector.subdict["text_clean"] == expected
def test_init_revision_numbers_and_models():
test_obj = tt.TextDetector({})
# check the default options

Просмотреть файл

@ -45,6 +45,8 @@ class TextDetector(AnalysisMethod):
if not isinstance(analyse_text, bool):
raise ValueError("analyse_text needs to be set to true or false")
self.analyse_text = analyse_text
if self.analyse_text:
self._initialize_spacy()
if model_names:
self._check_valid_models(model_names)
if revision_numbers:
@ -139,6 +141,14 @@ class TextDetector(AnalysisMethod):
params = {"text": None, "text_language": None, "text_english": None}
return params
def _initialize_spacy(self):
    """Initialize the Spacy library for text analysis.

    Loads the ``en_core_web_md`` pipeline into ``self.nlp``. If the model
    package cannot be found, it is downloaded and the load is retried once.
    """
    model_name = "en_core_web_md"
    try:
        self.nlp = spacy.load(model_name)
    except OSError:
        # spacy.load signals a missing model package with OSError; any other
        # failure would not be fixed by a download, so it is left to propagate.
        spacy.cli.download(model_name)
        self.nlp = spacy.load(model_name)
def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text.
@ -149,6 +159,8 @@ class TextDetector(AnalysisMethod):
self.translate_text()
self.remove_linebreaks()
if self.analyse_text:
self._run_spacy()
self.clean_text()
self.text_summary()
self.text_sentiment_transformers()
self.text_ner()
@ -200,6 +212,19 @@ class TextDetector(AnalysisMethod):
"\n", " "
)
def _run_spacy(self):
"""Generate Spacy doc object for further text analysis."""
self.doc = self.nlp(self.subdict["text_english"])
def clean_text(self):
    """Clean the text from unrecognized words and any numbers.

    Keeps the text of every token in ``self.doc`` that is not tagged as a
    number (``pos_ != "NUM"``) and that has a word vector, joins the kept
    tokens with single spaces, strips surrounding whitespace and stores
    the result in ``self.subdict["text_clean"]``.
    """
    kept_tokens = [
        token.text
        for token in self.doc
        if token.pos_ != "NUM" and token.has_vector
    ]
    self.subdict["text_clean"] = " ".join(kept_tokens).strip()
def text_summary(self):
"""Generate a summary of the text using the Transformers pipeline."""
# use the transformers pipeline to summarize the text

Просмотреть файл

@ -105,7 +105,7 @@ def find_files(
Args:
path (str, optional): The base directory where we are looking for the images. Defaults
to None, which uses the XDG data directory if set or the current
to None, which uses the ammico data directory if set or the current
working directory otherwise.
pattern (str|list, optional): The naming pattern that the filename should match.
Use either '.ext' or just 'ext'
@ -122,7 +122,7 @@ def find_files(
"""
if path is None:
path = os.environ.get("XDG_DATA_HOME", ".")
path = os.environ.get("AMMICO_DATA_HOME", ".")
if isinstance(pattern, str):
pattern = [pattern]