зеркало из
https://github.com/ssciwr/AMMICO.git
synced 2025-10-30 21:46:04 +02:00
put clean text back in, correct default image data path (#145)
Этот коммит содержится в:
родитель
e120c10d9f
Коммит
8eb4fca75e
@ -1,5 +1,6 @@
|
||||
import pytest
|
||||
import ammico.text as tt
|
||||
import spacy
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -30,6 +31,26 @@ def test_TextDetector(set_testdict):
|
||||
assert not test_obj.analyse_text
|
||||
|
||||
|
||||
def test_run_spacy(set_testdict, get_path):
    """Check that ``_run_spacy`` stores a spaCy ``Doc`` on the detector.

    The reference text for ``IMG_3755`` is read from disk and written straight
    into ``subdict["text_english"]`` before ``_run_spacy`` is called.
    """
    test_obj = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True)
    ref_file = get_path + "text_IMG_3755.txt"
    # Decode explicitly so the test does not depend on the platform's
    # locale-specific default encoding.
    with open(ref_file, "r", encoding="utf-8") as file:
        reference_text = file.read()
    test_obj.subdict["text_english"] = reference_text
    test_obj._run_spacy()
    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)
|
||||
|
||||
|
||||
def test_clean_text(set_testdict):
    """Check that ``clean_text`` drops the made-up word from the sample sentence."""
    expected = "I like cats and"
    pipeline = spacy.load("en_core_web_md")
    parsed = pipeline("I like cats and fjejg")
    detector = tt.TextDetector(set_testdict["IMG_3755"])
    # Inject the prepared doc directly rather than running the detector's
    # own spaCy step.
    detector.doc = parsed
    detector.clean_text()
    assert detector.subdict["text_clean"] == expected
|
||||
|
||||
|
||||
def test_init_revision_numbers_and_models():
|
||||
test_obj = tt.TextDetector({})
|
||||
# check the default options
|
||||
|
||||
@ -45,6 +45,8 @@ class TextDetector(AnalysisMethod):
|
||||
if not isinstance(analyse_text, bool):
|
||||
raise ValueError("analyse_text needs to be set to true or false")
|
||||
self.analyse_text = analyse_text
|
||||
if self.analyse_text:
|
||||
self._initialize_spacy()
|
||||
if model_names:
|
||||
self._check_valid_models(model_names)
|
||||
if revision_numbers:
|
||||
@ -139,6 +141,14 @@ class TextDetector(AnalysisMethod):
|
||||
params = {"text": None, "text_language": None, "text_english": None}
|
||||
return params
|
||||
|
||||
def _initialize_spacy(self):
    """Initialize the Spacy library for text analysis.

    Loads the ``en_core_web_md`` model into ``self.nlp``. If loading fails
    because the model is not available, the model is downloaded with
    ``spacy.cli.download`` and loaded again.
    """
    model_name = "en_core_web_md"
    try:
        self.nlp = spacy.load(model_name)
    except OSError:
        # spacy.load signals a missing model package with OSError; any other
        # error is left to propagate instead of triggering a download.
        spacy.cli.download(model_name)
        self.nlp = spacy.load(model_name)
|
||||
|
||||
def analyse_image(self) -> dict:
|
||||
"""Perform text extraction and analysis of the text.
|
||||
|
||||
@ -149,6 +159,8 @@ class TextDetector(AnalysisMethod):
|
||||
self.translate_text()
|
||||
self.remove_linebreaks()
|
||||
if self.analyse_text:
|
||||
self._run_spacy()
|
||||
self.clean_text()
|
||||
self.text_summary()
|
||||
self.text_sentiment_transformers()
|
||||
self.text_ner()
|
||||
@ -200,6 +212,19 @@ class TextDetector(AnalysisMethod):
|
||||
"\n", " "
|
||||
)
|
||||
|
||||
def _run_spacy(self):
    """Run the spaCy pipeline on the English text and keep the result.

    Reads ``self.subdict["text_english"]``, passes it to ``self.nlp`` and
    stores the returned object as ``self.doc``.
    """
    pipeline = self.nlp
    english_text = self.subdict["text_english"]
    self.doc = pipeline(english_text)
|
||||
|
||||
def clean_text(self):
    """Clean the text from unrecognized words and any numbers.

    Keeps the text of every token in ``self.doc`` that is not tagged as a
    number (``pos_ != "NUM"``) and that has a word vector
    (``has_vector``). The kept tokens are joined with single spaces,
    surrounding whitespace is stripped, and the result is stored in
    ``self.subdict["text_clean"]``.
    """
    kept_words = [
        token.text
        for token in self.doc
        if token.pos_ != "NUM" and token.has_vector
    ]
    self.subdict["text_clean"] = " ".join(kept_words).strip()
|
||||
|
||||
def text_summary(self):
|
||||
"""Generate a summary of the text using the Transformers pipeline."""
|
||||
# use the transformers pipeline to summarize the text
|
||||
|
||||
@ -105,7 +105,7 @@ def find_files(
|
||||
|
||||
Args:
|
||||
path (str, optional): The base directory where we are looking for the images. Defaults
|
||||
to None, which uses the XDG data directory if set or the current
|
||||
to None, which uses the ammico data directory if set or the current
|
||||
working directory otherwise.
|
||||
pattern (str|list, optional): The naming pattern that the filename should match.
|
||||
Use either '.ext' or just 'ext'
|
||||
@ -122,7 +122,7 @@ def find_files(
|
||||
"""
|
||||
|
||||
if path is None:
|
||||
path = os.environ.get("XDG_DATA_HOME", ".")
|
||||
path = os.environ.get("AMMICO_DATA_HOME", ".")
|
||||
|
||||
if isinstance(pattern, str):
|
||||
pattern = [pattern]
|
||||
|
||||
Загрузка…
x
Ссылка в новой задаче
Block a user