Add text analyzer to skip text extraction from image (#199)

* read in text from csv

* add tests for csv reading

* run textanalyzer in demo notebook

* add text analyser in doc and demo

* improve init TextDetector testing

* more init tests

* add csv encoding keyword

* add utf16-csv file

* skip csv reading on windows
Этот коммит содержится в:
Inga Ulusoy 2024-06-05 09:28:28 +02:00 коммит произвёл GitHub
родитель 9202f51d9f
Коммит 4ac760e690
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
9 изменённых файлов: 328 добавлений и 14 удалений

Просмотреть файл

@ -8,7 +8,7 @@ from ammico.display import AnalysisExplorer
from ammico.faces import EmotionDetector from ammico.faces import EmotionDetector
from ammico.multimodal_search import MultimodalSearch from ammico.multimodal_search import MultimodalSearch
from ammico.summary import SummaryDetector from ammico.summary import SummaryDetector
from ammico.text import TextDetector, PostprocessText from ammico.text import TextDetector, TextAnalyzer, PostprocessText
from ammico.utils import find_files, get_dataframe from ammico.utils import find_files, get_dataframe
# Export the version defined in project metadata # Export the version defined in project metadata
@ -23,6 +23,7 @@ __all__ = [
"MultimodalSearch", "MultimodalSearch",
"SummaryDetector", "SummaryDetector",
"TextDetector", "TextDetector",
"TextAnalyzer",
"PostprocessText", "PostprocessText",
"find_files", "find_files",
"get_dataframe", "get_dataframe",

8
ammico/data/ref/test.csv Обычный файл
Просмотреть файл

@ -0,0 +1,8 @@
text, date
this is a test, 05/31/24
bu bir denemedir, 05/31/24
dies ist ein Test, 05/31/24
c'est un test, 05/31/24
esto es una prueba, 05/31/24
detta är ett test, 05/31/24
1 text date
2 this is a test 05/31/24
3 bu bir denemedir 05/31/24
4 dies ist ein Test 05/31/24
5 c'est un test 05/31/24
6 esto es una prueba 05/31/24
7 detta är ett test 05/31/24

Просмотреть файл

@ -366,6 +366,94 @@
"image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read in a csv file containing text and translate/analyse the text\n",
"\n",
"Instead of extracting text from an image, or to re-process text that was already extracted, it is also possible to provide a `csv` file containing text in its rows.\n",
"Provide the path and name of the csv file with the keyword `csv_path`. The keyword `column_key` tells the Analyzer which column key in the csv file holds the text that should be analyzed. This defaults to \"text\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ta = ammico.TextAnalyzer(csv_path=\"../data/ref/test.csv\", column_key=\"text\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read the csv file\n",
"ta.read_csv()\n",
"# set up the dict containing all text entries\n",
"text_dict = ta.mydict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# set the dump file\n",
"# dump file name\n",
"dump_file = \"dump_file.csv\"\n",
"# dump every N images \n",
"dump_every = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# analyze the csv file\n",
"for num, key in tqdm(enumerate(text_dict.keys()), total=len(text_dict)): # loop through all text entries\n",
" ammico.TextDetector(text_dict[key], analyse_text=True, skip_extraction=True).analyse_image() # analyse text with TextDetector and update dict\n",
" if num % dump_every == 0 | num == len(text_dict) - 1: # save results every dump_every to dump_file\n",
" image_df = ammico.get_dataframe(text_dict)\n",
" image_df.to_csv(dump_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the results to a csv file\n",
"text_df = ammico.get_dataframe(text_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# inspect\n",
"text_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write to csv\n",
"text_df.to_csv(\"data_out.csv\")"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

Двоичные данные
ammico/test/data/test-utf16.csv Обычный файл

Двоичный файл не отображается.
1 text date
2 this is a test 05/31/24
3 bu bir denemedir 05/31/24
4 dies ist ein Test 05/31/24
5 c'est un test 05/31/24
6 esto es una prueba 05/31/24
7 detta är ett test 05/31/24

8
ammico/test/data/test.csv Обычный файл
Просмотреть файл

@ -0,0 +1,8 @@
text, date
this is a test, 05/31/24
bu bir denemedir, 05/31/24
dies ist ein Test, 05/31/24
c'est un test, 05/31/24
esto es una prueba, 05/31/24
detta är ett test, 05/31/24
1 text date
2 this is a test 05/31/24
3 bu bir denemedir 05/31/24
4 dies ist ein Test 05/31/24
5 c'est un test 05/31/24
6 esto es una prueba 05/31/24
7 detta är ett test 05/31/24

32
ammico/test/data/test_read_csv_ref.json Обычный файл
Просмотреть файл

@ -0,0 +1,32 @@
{
"test.csvrow-1":
{
"filename": "test.csv",
"text": "this is a test"
},
"test.csvrow-2":
{
"filename": "test.csv",
"text": "bu bir denemedir"
},
"test.csvrow-3":
{
"filename": "test.csv",
"text": "dies ist ein Test"
},
"test.csvrow-4":
{
"filename": "test.csv",
"text": "c'est un test"
},
"test.csvrow-5":
{
"filename": "test.csv",
"text": "esto es una prueba"
},
"test.csvrow-6":
{
"filename": "test.csv",
"text": "detta är ett test"
}
}

Просмотреть файл

@ -1,6 +1,8 @@
import pytest import pytest
import ammico.text as tt import ammico.text as tt
import spacy import spacy
import json
import sys
@pytest.fixture @pytest.fixture
@ -25,10 +27,25 @@ LANGUAGES = ["de", "en", "en"]
def test_TextDetector(set_testdict): def test_TextDetector(set_testdict):
for item in set_testdict: for item in set_testdict:
test_obj = tt.TextDetector(set_testdict[item]) test_obj = tt.TextDetector(set_testdict[item])
assert test_obj.subdict["text"] is None
assert test_obj.subdict["text_language"] is None
assert test_obj.subdict["text_english"] is None
assert not test_obj.analyse_text assert not test_obj.analyse_text
assert not test_obj.skip_extraction
assert test_obj.subdict["filename"] == set_testdict[item]["filename"]
assert test_obj.model_summary == "sshleifer/distilbart-cnn-12-6"
assert (
test_obj.model_sentiment
== "distilbert-base-uncased-finetuned-sst-2-english"
)
assert test_obj.model_ner == "dbmdz/bert-large-cased-finetuned-conll03-english"
assert test_obj.revision_summary == "a4f8f3e"
assert test_obj.revision_sentiment == "af0f99b"
assert test_obj.revision_ner == "f2482bf"
test_obj = tt.TextDetector({}, analyse_text=True, skip_extraction=True)
assert test_obj.analyse_text
assert test_obj.skip_extraction
with pytest.raises(ValueError):
tt.TextDetector({}, analyse_text=1.0)
with pytest.raises(ValueError):
tt.TextDetector({}, skip_extraction=1.0)
def test_run_spacy(set_testdict, get_path): def test_run_spacy(set_testdict, get_path):
@ -140,7 +157,6 @@ def test_remove_linebreaks():
assert test_obj.subdict["text_english"] == "This is another test." assert test_obj.subdict["text_english"] == "This is another test."
@pytest.mark.win_skip
def test_text_summary(get_path): def test_text_summary(get_path):
mydict = {} mydict = {}
test_obj = tt.TextDetector(mydict, analyse_text=True) test_obj = tt.TextDetector(mydict, analyse_text=True)
@ -162,7 +178,6 @@ def test_text_sentiment_transformers():
assert mydict["sentiment_score"] == pytest.approx(0.99, 0.02) assert mydict["sentiment_score"] == pytest.approx(0.99, 0.02)
@pytest.mark.win_skip
def test_text_ner(): def test_text_ner():
mydict = {} mydict = {}
test_obj = tt.TextDetector(mydict, analyse_text=True) test_obj = tt.TextDetector(mydict, analyse_text=True)
@ -172,7 +187,51 @@ def test_text_ner():
assert mydict["entity_type"] == ["PER", "LOC"] assert mydict["entity_type"] == ["PER", "LOC"]
@pytest.mark.win_skip def test_init_csv_option(get_path):
test_obj = tt.TextAnalyzer(csv_path=get_path + "test.csv")
assert test_obj.csv_path == get_path + "test.csv"
assert test_obj.column_key == "text"
assert test_obj.csv_encoding == "utf-8"
test_obj = tt.TextAnalyzer(
csv_path=get_path + "test.csv", column_key="mytext", csv_encoding="utf-16"
)
assert test_obj.column_key == "mytext"
assert test_obj.csv_encoding == "utf-16"
with pytest.raises(ValueError):
tt.TextAnalyzer(csv_path=1.0)
with pytest.raises(ValueError):
tt.TextAnalyzer(csv_path="something")
with pytest.raises(FileNotFoundError):
tt.TextAnalyzer(csv_path=get_path + "test_no.csv")
with pytest.raises(ValueError):
tt.TextAnalyzer(csv_path=get_path + "test.csv", column_key=1.0)
with pytest.raises(ValueError):
tt.TextAnalyzer(csv_path=get_path + "test.csv", csv_encoding=1.0)
@pytest.mark.skipif(sys.platform == "win32", reason="Encoding different on Windows")
def test_read_csv(get_path):
test_obj = tt.TextAnalyzer(csv_path=get_path + "test.csv")
test_obj.read_csv()
with open(get_path + "test_read_csv_ref.json", "r") as file:
ref_dict = json.load(file)
# we are assuming the order did not get jumbled up
for (_, value_test), (_, value_ref) in zip(
test_obj.mydict.items(), ref_dict.items()
):
assert value_test["text"] == value_ref["text"]
# test with different encoding
test_obj = tt.TextAnalyzer(
csv_path=get_path + "test-utf16.csv", csv_encoding="utf-16"
)
test_obj.read_csv()
# we are assuming the order did not get jumbled up
for (_, value_test), (_, value_ref) in zip(
test_obj.mydict.items(), ref_dict.items()
):
assert value_test["text"] == value_ref["text"]
def test_PostprocessText(set_testdict, get_path): def test_PostprocessText(set_testdict, get_path):
reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON" reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage" reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"

Просмотреть файл

@ -15,6 +15,7 @@ class TextDetector(AnalysisMethod):
self, self,
subdict: dict, subdict: dict,
analyse_text: bool = False, analyse_text: bool = False,
skip_extraction: bool = False,
model_names: list = None, model_names: list = None,
revision_numbers: list = None, revision_numbers: list = None,
) -> None: ) -> None:
@ -25,6 +26,8 @@ class TextDetector(AnalysisMethod):
analysis results from other modules. analysis results from other modules.
analyse_text (bool, optional): Decide if extracted text will be further subject analyse_text (bool, optional): Decide if extracted text will be further subject
to analysis. Defaults to False. to analysis. Defaults to False.
skip_extraction (bool, optional): Decide if text will be extracted from images or
is already provided via a csv. Defaults to False.
model_names (list, optional): Provide model names for summary, sentiment and ner model_names (list, optional): Provide model names for summary, sentiment and ner
analysis. Defaults to None, in which case the default model from transformers analysis. Defaults to None, in which case the default model from transformers
are used (as of 03/2023): "sshleifer/distilbart-cnn-12-6" (summary), are used (as of 03/2023): "sshleifer/distilbart-cnn-12-6" (summary),
@ -40,11 +43,21 @@ class TextDetector(AnalysisMethod):
"f2482bf" (NER, bert). "f2482bf" (NER, bert).
""" """
super().__init__(subdict) super().__init__(subdict)
self.subdict.update(self.set_keys()) # disable this for now
# maybe it would be better to initialize the keys differently
# the reason is that they are inconsistent depending on the selected
# options, and also this may not be really necessary and rather restrictive
# self.subdict.update(self.set_keys())
self.translator = Translator() self.translator = Translator()
if not isinstance(analyse_text, bool): if not isinstance(analyse_text, bool):
raise ValueError("analyse_text needs to be set to true or false") raise ValueError("analyse_text needs to be set to true or false")
self.analyse_text = analyse_text self.analyse_text = analyse_text
self.skip_extraction = skip_extraction
if not isinstance(skip_extraction, bool):
raise ValueError("skip_extraction needs to be set to true or false")
if self.skip_extraction:
print("Skipping text extraction from image.")
print("Reading text directly from provided dictionary.")
if self.analyse_text: if self.analyse_text:
self._initialize_spacy() self._initialize_spacy()
if model_names: if model_names:
@ -155,6 +168,7 @@ class TextDetector(AnalysisMethod):
Returns: Returns:
dict: The updated dictionary with text analysis results. dict: The updated dictionary with text analysis results.
""" """
if not self.skip_extraction:
self.get_text_from_image() self.get_text_from_image()
self.translate_text() self.translate_text()
self.remove_linebreaks() self.remove_linebreaks()
@ -287,18 +301,32 @@ class TextDetector(AnalysisMethod):
class TextAnalyzer: class TextAnalyzer:
"""Used to get text from a csv and then run the TextDetector on it.""" """Used to get text from a csv and then run the TextDetector on it."""
def __init__(self, csv_path: str, column_key: str = None) -> None: def __init__(
self, csv_path: str, column_key: str = None, csv_encoding: str = "utf-8"
) -> None:
"""Init the TextAnalyzer class. """Init the TextAnalyzer class.
Args: Args:
csv_path (str): Path to the CSV file containing the text entries. csv_path (str): Path to the CSV file containing the text entries.
column_key (str): Key for the column containing the text entries. column_key (str): Key for the column containing the text entries.
Defaults to None. Defaults to None.
csv_encoding (str): Encoding of the CSV file. Defaults to "utf-8".
""" """
self.csv_path = csv_path self.csv_path = csv_path
self.column_key = column_key self.column_key = column_key
self.csv_encoding = csv_encoding
self._check_valid_csv_path() self._check_valid_csv_path()
self._check_file_exists() self._check_file_exists()
if not self.column_key:
print("No column key provided - using 'text' as default.")
self.column_key = "text"
if not self.csv_encoding:
print("No encoding provided - using 'utf-8' as default.")
self.csv_encoding = "utf-8"
if not isinstance(self.column_key, str):
raise ValueError("The provided column key is not a string.")
if not isinstance(self.csv_encoding, str):
raise ValueError("The provided encoding is not a string.")
def _check_valid_csv_path(self): def _check_valid_csv_path(self):
if not isinstance(self.csv_path, str): if not isinstance(self.csv_path, str):
@ -319,9 +347,7 @@ class TextAnalyzer:
Returns: Returns:
dict: The dictionary with the text entries. dict: The dictionary with the text entries.
""" """
df = pd.read_csv(self.csv_path, encoding="utf8") df = pd.read_csv(self.csv_path, encoding=self.csv_encoding)
if not self.column_key:
self.column_key = "text"
if self.column_key not in df: if self.column_key not in df:
raise ValueError( raise ValueError(

Просмотреть файл

@ -94,7 +94,10 @@
"import os\n", "import os\n",
"import ammico\n", "import ammico\n",
"# for displaying a progress bar\n", "# for displaying a progress bar\n",
"from tqdm import tqdm" "from tqdm import tqdm\n",
"# to get the reference data for text_dict\n",
"import importlib_resources\n",
"pkg = importlib_resources.files(\"ammico\")"
] ]
}, },
{ {
@ -363,6 +366,95 @@
"image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read in a csv file containing text and translate/analyse the text\n",
"\n",
"Instead of extracting text from an image, or to re-process text that was already extracted, it is also possible to provide a `csv` file containing text in its rows.\n",
"Provide the path and name of the csv file with the keyword `csv_path`. The keyword `column_key` tells the Analyzer which column key in the csv file holds the text that should be analyzed. This defaults to \"text\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"csv_path = pkg / \"data\" / \"ref\" / \"test.csv\"\n",
"ta = ammico.TextAnalyzer(csv_path=str(csv_path), column_key=\"text\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read the csv file\n",
"ta.read_csv()\n",
"# set up the dict containing all text entries\n",
"text_dict = ta.mydict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# set the dump file\n",
"# dump file name\n",
"dump_file = \"dump_file.csv\"\n",
"# dump every N images \n",
"dump_every = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# analyze the csv file\n",
"for num, key in tqdm(enumerate(text_dict.keys()), total=len(text_dict)): # loop through all text entries\n",
" ammico.TextDetector(text_dict[key], analyse_text=True, skip_extraction=True).analyse_image() # analyse text with TextDetector and update dict\n",
" if num % dump_every == 0 | num == len(text_dict) - 1: # save results every dump_every to dump_file\n",
" image_df = ammico.get_dataframe(text_dict)\n",
" image_df.to_csv(dump_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the results to a csv file\n",
"text_df = ammico.get_dataframe(text_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# inspect\n",
"text_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write to csv\n",
"text_df.to_csv(\"data_out.csv\")"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},