# AMMICO/misinformation/accuracy.py

import pandas as pd
import json

from misinformation import utils
from misinformation import faces


class LabelManager:
    """Load manually coded reference labels and map analysis output onto them.

    Three inputs are read on construction:

    * an Excel sheet ("variable_labels_codings") describing the coded
      variables; it must contain an ``order`` column, and its second column
      is used as the variable name,
    * a CSV file holding the coded labels, whose column names are the
      lower-cased variable names from the Excel sheet,
    * a JSON map whose values each provide the keys ``order``,
      ``variable_mydict``, ``value_mydict`` and ``variable_coding``.

    The paths default to locations relative to the current working
    directory and can be overridden through the constructor.
    """

    # Default input locations; instances may override them via __init__.
    labels_code_path = "./misinformation/test/data/EUROPE_APRMAY20_data_variable_labels_coding.xlsx"
    labels_path = "./misinformation/test/data/Europe_APRMAY20data190722.csv"
    map_path = "./misinformation/data/map_test_set.json"

    def __init__(self, labels_code_path=None, labels_path=None, map_path=None):
        """Set up the manager and immediately load all input files.

        Args:
            labels_code_path: Optional path to the Excel coding sheet.
            labels_path: Optional path to the CSV file with the labels.
            map_path: Optional path to the JSON variable map.

        Any argument left as ``None`` falls back to the class default.
        """
        if labels_code_path is not None:
            self.labels_code_path = labels_code_path
        if labels_path is not None:
            self.labels_path = labels_path
        if map_path is not None:
            self.map_path = map_path

        self.labels_code = None
        self.labels = None
        self.f_labels = None
        self.f_labels_code = None
        # Declared here so the attribute exists even before load() ran.
        self.map = None
        self.load()

    def load(self):
        """Read the coding sheet, the label table and the variable map."""
        self.labels_code = pd.read_excel(
            self.labels_code_path,
            sheet_name="variable_labels_codings",
        )
        self.labels = pd.read_csv(
            self.labels_path,
            sep=",",
            decimal=".",
        )
        self.map = self.read_json(self.map_path)

    def read_json(self, name):
        """Return the parsed content of the JSON file at path ``name``."""
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(name, encoding="utf-8") as f:
            return json.load(f)

    def get_orders(self):
        """Return the ``order`` value of every entry in the variable map."""
        return [entry["order"] for entry in self.map.values()]

    def filter_from_order(self, orders: list):
        """Restrict the coding sheet and the labels to the given orders.

        Stores the result in ``self.f_labels_code`` (rows of the coding
        sheet whose ``order`` is in ``orders``) and ``self.f_labels`` (the
        label columns named after the selected variables, lower-cased).

        Args:
            orders: 1-based order numbers. Each is used as a row position
                (``order - 1``) in the coding sheet.
                NOTE(review): this presumes the sheet rows are sorted by
                ``order`` starting at 1 - confirm against the data file.
        """
        cols = [self.labels_code.iloc[order - 1, 1].lower() for order in orders]

        self.f_labels_code = self.labels_code.loc[
            self.labels_code["order"].isin(orders)
        ]
        self.f_labels = self.labels[cols]

    def gen_dict(self):
        """Convert the filtered labels to a dict keyed by ``pic_id``.

        Returns:
            A dict mapping each row's ``pic_id`` value to a dict of
            ``{column: value}`` for that row. Empty if
            ``filter_from_order`` has not been called yet.

        Raises:
            KeyError: If the filtered labels have no ``pic_id`` column.
        """
        labels_dict = {}
        if self.f_labels is None:
            print("No filtered labels found")
            return labels_dict

        cols = self.f_labels.columns.tolist()
        for _, row in self.f_labels.iterrows():
            labels_dict[row["pic_id"]] = {col: row[col] for col in cols}

        return labels_dict

    def map_dict(self, mydict):
        """Translate analysis results into the coded variable names/values.

        Args:
            mydict: Dict keyed by picture id string. The last two
                characters of the id are treated as the picture order, the
                rest as the id. Each value is a dict containing every
                ``variable_mydict`` named in the variable map.

        Returns:
            A dict with the same keys; each value holds ``id``,
            ``pic_order``, ``pic_id`` and one entry per variable in the
            map. List values are reduced to their first element (and, if
            that is a tuple, to the tuple's first item) and compared to
            ``value_mydict``, giving 1 or 0; an empty list gives 0.
            Non-list values are passed through unchanged unless the
            variable is coded ``"Bool"``, in which case they are compared
            to ``value_mydict`` and give 1 or 0.
        """
        mapped_dict = {}
        for pic_id, subdict in mydict.items():
            mapped_subdict = {
                "id": pic_id[0:-2],
                # drop the leading zero of a single-digit picture order
                "pic_order": pic_id[-1] if pic_id[-2] == "0" else pic_id[-2:],
                "pic_id": pic_id,
            }
            for key, spec in self.map.items():
                expected = spec["value_mydict"]
                coding = spec["variable_coding"]
                # find out which value was set
                current = subdict[spec["variable_mydict"]]

                if isinstance(current, list):
                    # take only the first value in lists - this applies to
                    # faces, reported are up to three in a list, we compare
                    # only the largest one here
                    first = current[0] if current else None
                    # also cut out the likelihood for detected emotion
                    if isinstance(first, tuple):
                        first = first[0]
                    # an empty list can never match, whatever is expected
                    matched = bool(current) and first == expected
                    mapped_subdict[key] = 1 if matched else 0
                elif coding != "Bool":
                    # values that are not boolean are passed through as-is
                    mapped_subdict[key] = current
                else:
                    mapped_subdict[key] = 1 if current == expected else 0
            mapped_dict[pic_id] = mapped_subdict
        return mapped_dict


if __name__ == "__main__":
    # Collect up to 500 of the cropped test images and prepare one result
    # entry per image.
    image_files = utils.find_files(
        path="/home/inga/projects/misinformation-project/misinformation/misinformation/test/data/Europe APRMAY20 visual data/cropped images",
        limit=500,
    )
    mydict = utils.initialize_dict(image_files)

    # Run the face/emotion analysis on every image entry. The keys are
    # copied first because the entries are replaced while looping.
    for image_id in list(mydict):
        detector = faces.EmotionDetector(mydict[image_id])
        mydict[image_id] = detector.analyse_image()

    # Dump the raw analysis results to disk for inspection.
    outdict = utils.append_data_to_dict(mydict)
    df = utils.dump_df(outdict)
    # print(df.head(10))
    df.to_csv("mydict_out.csv")

    # Example use of LabelManager: load the reference labels, map the
    # analysis results to the coded variable names/values, and keep only
    # the label columns needed (the first three plus the mapped ones).
    lm = LabelManager()
    orders = lm.get_orders()
    mydict_map = lm.map_dict(mydict)
    lm.filter_from_order([1, 2, 3] + orders)
    labels = lm.gen_dict()

    # Sum up, per variable, the absolute deviation between the reference
    # labels and the computed values over all pictures.
    comp = {}
    for key, reference in labels.items():
        pic_key = str(key)
        if pic_key not in mydict_map:
            print("Key {} not found.".format(key))
            continue
        computed = mydict_map[pic_key]
        print("ref: {}".format(reference))
        print("com: {}".format(computed))
        for subkey, ref_value in reference.items():
            # only plain ints on both sides are compared
            if type(ref_value) is not int or type(computed[subkey]) is not int:
                continue
            deviation = abs(ref_value - computed[subkey])
            comp[subkey] = comp.get(subkey, 0) + deviation

    print("summary: ")
    # v9_5a does not show up in the summary because it is read in as float
    # from the csv and is therefore skipped by the int-only filter above.
    print(comp)
    # Summary recorded from a previous run:
    # {'v9_4': 42, 'v9_5b': 1579, 'v9_6': 229, 'v9_7': 45, 'v9_8': 39, 'v9_8a': 31, 'v9_9': 58, 'v9_10': 33, 'v9_11': 22, 'v9_12': 2, 'v9_13': 24, 'v11_3': 39}
    # The important entries are:
    # Overall positive - 'v9_8': 39 deviations
    # Overall negative - 'v9_9': 58
    # happy - 'v9_8a': 31
    # fear - 'v9_10': 33
    # angry - 'v9_11': 22
    # disgust - 'v9_12': 2
    # sad - 'v9_13': 24
    # respect of rules = wears mask - 'v11_3': 39