# AMMICO/misinformation/test/test_text.py

import os
import pytest
import spacy
import misinformation.text as tt
import misinformation
import pandas as pd

# Image fixtures shared by the tests in this module, keyed by image id.
# Each entry starts out with only the path of the image file.
TESTDICT = {
    "IMG_3755": {"filename": "./test/data/IMG_3755.jpg"},
    "IMG_3756": {"filename": "./test/data/IMG_3756.jpg"},
    "IMG_3757": {"filename": "./test/data/IMG_3757.jpg"},
}

# Expected language codes, paired positionally with the TESTDICT entries
# (they are zipped together in test_translate_text).
LANGUAGES = ["de", "om", "en"]

# Set at import time; presumably read by the Google Cloud client used in the
# gcv-marked tests -- TODO confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "../data/seismic-bonfire-329406-412821a70264.json"
)


def test_TextDetector():
    """Check the initial state of a freshly constructed TextDetector.

    For every fixture entry the three text fields must be None and the
    analyse_text flag must be falsy.
    """
    for entry in TESTDICT.values():
        detector = tt.TextDetector(entry)
        for field in ("text", "text_language", "text_english"):
            assert detector.subdict[field] is None
        assert not detector.analyse_text


@pytest.mark.gcv
def test_analyse_image():
    """Smoke-test analyse_image with default settings and with analyse_text=True.

    Nothing is asserted; the test passes when neither call raises.
    """
    for entry in TESTDICT.values():
        tt.TextDetector(entry).analyse_image()
        tt.TextDetector(entry, analyse_text=True).analyse_image()


@pytest.mark.gcv
def test_get_text_from_image():
    """Compare the text extracted from each fixture image with its reference file."""
    for name, entry in TESTDICT.items():
        detector = tt.TextDetector(entry)
        detector.get_text_from_image()
        expected_path = "./test/data/text_" + name + ".txt"
        with open(expected_path, "r", encoding="utf8") as handle:
            expected = handle.read()
        assert detector.subdict["text"] == expected


def test_translate_text():
    """Check language detection and English translation for each fixture text.

    The stored reference text is placed into the detector's subdict, then
    translate_text is run and both the detected language code and the
    translation are compared with the expected values.
    """
    for (name, entry), expected_lang in zip(TESTDICT.items(), LANGUAGES):
        detector = tt.TextDetector(entry)
        source_path = "./test/data/text_" + name + ".txt"
        translated_path = "./test/data/text_translated_" + name + ".txt"
        with open(source_path, "r", encoding="utf8") as handle:
            source_text = handle.read()
        with open(translated_path, "r", encoding="utf8") as handle:
            expected_translation = handle.read()
        detector.subdict["text"] = source_text
        detector.translate_text()
        assert detector.subdict["text_language"] == expected_lang
        assert detector.subdict["text_english"] == expected_translation


def test_remove_linebreaks():
    """Check that remove_linebreaks replaces each newline with a single space.

    Both the "text" and the "text_english" entries are expected to be cleaned.
    """
    detector = tt.TextDetector({})
    detector.subdict["text"] = "This is \n a test."
    detector.subdict["text_english"] = "This is \n another\n test."
    detector.remove_linebreaks()
    expected = {
        "text": "This is   a test.",
        "text_english": "This is   another  test.",
    }
    for key, value in expected.items():
        assert detector.subdict[key] == value


def test_run_spacy():
    """Check that _run_spacy stores a spaCy Doc on the detector.

    The reference text is loaded into "text_english" before the pipeline is
    run. The file is opened with an explicit UTF-8 encoding, matching the
    other reference-file reads in this module, so that the test does not
    depend on the platform's default encoding.
    """
    test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True)
    ref_file = "./test/data/text_IMG_3755.txt"
    with open(ref_file, "r", encoding="utf8") as file:
        reference_text = file.read()
    test_obj.subdict["text_english"] = reference_text
    test_obj._run_spacy()
    assert isinstance(test_obj.doc, spacy.tokens.doc.Doc)


def test_clean_text():
    """Check that clean_text drops the unknown token from the parsed sentence.

    A spaCy Doc is attached to the detector by hand, and the cleaned result is
    read back from the "text_clean" entry.
    """
    pipeline = spacy.load("en_core_web_md")
    parsed = pipeline("I like cats and fjejg")
    detector = tt.TextDetector(TESTDICT["IMG_3755"])
    detector.doc = parsed
    detector.clean_text()
    assert detector.subdict["text_clean"] == "I like cats and"


def test_correct_spelling():
    """Check the spelling-corrected text written to "text_english_correct".

    Per the expected string, "lik" is corrected to "like" while "ad" stays
    unchanged.
    """
    detector = tt.TextDetector({}, analyse_text=True)
    detector.subdict["text_english"] = "I lik cats ad dogs."
    detector.correct_spelling()
    expected = "I like cats ad dogs."
    assert detector.subdict["text_english_correct"] == expected


def test_sentiment_analysis():
    """Check the polarity and subjectivity computed for a short sentence.

    The spaCy pipeline and the spelling correction are run first, then
    sentiment_analysis fills the "polarity" and "subjectivity" entries.
    """
    mydict = {}
    test_obj = tt.TextDetector(mydict, analyse_text=True)
    test_obj.subdict["text_english"] = "I love cats and dogs."
    test_obj._run_spacy()
    test_obj.correct_spelling()
    test_obj.sentiment_analysis()
    # The scores are floats, so compare with a tolerance instead of exact
    # equality to stay robust against rounding differences.
    assert test_obj.subdict["polarity"] == pytest.approx(0.5)
    assert test_obj.subdict["subjectivity"] == pytest.approx(0.6)


def test_PostprocessText():
    """Check PostprocessText construction from a dict and from a csv file.

    Covers the happy path for both inputs and the ValueError raised when the
    "text_english" data is missing from the dict, when the csv file
    "test_data_out_nokey.csv" is used, and when no input is given at all.
    """
    reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. WILKINSON"
    reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage"
    # NOTE(review): TESTDICT as defined in this module has no "text_english"
    # entries, so this presumably relies on earlier tests having filled them
    # in -- confirm.
    obj = tt.PostprocessText(mydict=TESTDICT)
    # make sure test works on windows where end-of-line character is \r\n
    test_dict = obj.list_text_english[2].replace("\r", "")
    assert test_dict == reference_dict
    # Build a copy without "text_english" instead of popping the key from the
    # module-level TESTDICT, so the shared fixture is left untouched.
    stripped_dict = {
        key: {
            field: value
            for field, value in entry.items()
            if field != "text_english"
        }
        for key, entry in TESTDICT.items()
    }
    with pytest.raises(ValueError):
        tt.PostprocessText(mydict=stripped_dict)
    obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv")
    # make sure test works on windows where end-of-line character is \r\n
    test_df = obj.list_text_english[0].replace("\r", "")
    assert test_df == reference_df
    with pytest.raises(ValueError):
        tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv")
    # Neither a dict nor a csv file is supplied.
    with pytest.raises(ValueError):
        tt.PostprocessText()


def test_analyse_topic():
    """Run the topic analysis on the csv fixture and check the reported topics."""
    postprocessor = tt.PostprocessText(
        use_csv=True, csv_path="./test/data/topic_analysis_test.csv"
    )
    _, topic_df, most_frequent_topics = postprocessor.analyse_topic()
    # NOTE: the analysis was flagged by the original author as not
    # deterministic, so the exact values checked below may not be reproduced
    # on every run.
    assert len(topic_df) == 2
    first_topic_name = topic_df["Name"].iloc[0]
    assert first_topic_name == "0_the_feat_of_is"
    assert most_frequent_topics[0][0][0] == "the"