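# Tests for misinformation.multimodal_search: loading the BLIP, BLIP2, ALBEF,
# and CLIP feature extractors, pre-processing images and text queries, and
# extracting features that are compared against precomputed reference values.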
import math

import numpy
import pytest
from lavis.models import load_model_and_preprocess
from PIL import Image
from torch import device, cuda, no_grad

import misinformation.multimodal_search as ms

TEST_IMAGE_1 = "./test/data/d755771b-225e-432f-802e-fb8dc850fff7.png"
TEST_IMAGE_2 = "./test/data/IMG_2746.png"
TEST_IMAGE_3 = "./test/data/IMG_2750.png"
TEST_IMAGE_4 = "./test/data/IMG_2805.png"
TEST_IMAGE_5 = "./test/data/IMG_2806.png"
TEST_IMAGE_6 = "./test/data/IMG_2807.png"
TEST_IMAGE_7 = "./test/data/IMG_2808.png"
TEST_IMAGE_8 = "./test/data/IMG_2809.png"
TEST_IMAGE_9 = "./test/data/IMG_3755.jpg"
TEST_IMAGE_10 = "./test/data/IMG_3756.jpg"
TEST_IMAGE_11 = "./test/data/IMG_3757.jpg"
TEST_IMAGE_12 = "./test/data/pic1.png"

# Relative tolerance for math.isclose comparisons against reference values.
related_error = 1e-3

gpu_is_not_available = not cuda.is_available()

def test_read_img():
    """read_img should load an image whose pixel values match the reference."""
    my_dict = {}
    test_img = ms.MultimodalSearch.read_img(my_dict, TEST_IMAGE_2)
    assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]

@pytest.mark.skipif(gpu_is_not_available, reason="model for gpu only")
def test_load_feature_extractor_model_blip2():
    """GPU-only: BLIP2 pre-processing, feature extraction, and batch image reading."""
    my_dict = {}
    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_blip2(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    extracted_feature_img = model.extract_features(
        {"image": processed_pic, "text_input": ""}, mode="image"
    )
    extracted_feature_text = model.extract_features(
        {"image": "", "text_input": processed_text}, mode="text"
    )

    check_list_processed_pic = [
        -1.0039474964141846,
        -1.0039474964141846,
        -0.8433647751808167,
        -0.6097899675369263,
        -0.5951915383338928,
        -0.6243883967399597,
        -0.6827820539474487,
        -0.6097899675369263,
        -0.7119789123535156,
        -1.0623412132263184,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
    )

    check_list_extracted_feature_img = [
        0.04566730558872223,
        -0.042554520070552826,
        -0.06970272958278656,
        -0.009771779179573059,
        0.01446065679192543,
        0.10173682868480682,
        0.007092420011758804,
        -0.020045937970280647,
        0.12923966348171234,
        0.006452132016420364,
    ]
    for i, num in zip(
        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        -0.1384519338607788,
        -0.008663734421133995,
        0.006240826100111008,
        0.031466349959373474,
        0.060625165700912476,
        -0.03230545297265053,
        0.01585903950035572,
        -0.11856520175933838,
        -0.05823372304439545,
        0.036941494792699814,
    ]
    for i, num in zip(
        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    image_paths = [TEST_IMAGE_2, TEST_IMAGE_3]
    raw_images, images_tensors = ms.MultimodalSearch.read_and_process_images(
        my_dict, image_paths, vis_processor
    )

    assert list(numpy.array(raw_images[0])[257][34]) == [70, 66, 63]

    check_list_images_tensors = [
        -1.0039474964141846,
        -1.0039474964141846,
        -0.8433647751808167,
        -0.6097899675369263,
        -0.5951915383338928,
        -0.6243883967399597,
        -0.6827820539474487,
        -0.6097899675369263,
        -0.7119789123535156,
        -1.0623412132263184,
    ]
    for i, num in zip(range(10), images_tensors[0, 0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_images_tensors[i], rel_tol=related_error)

    del model, vis_processor, txt_processor
    cuda.empty_cache()

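# The per-model tests below share one structure: load the feature extractor on
# the parametrized device, pre-process a reference image and text query,
# extract image and text features under no_grad, and compare slices of the
# results against hard-coded reference values.
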
@pytest.mark.parametrize(
    "multimodal_device",
    [
        device("cpu"),
        pytest.param(
            device("cuda"),
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
    ],
)
def test_load_feature_extractor_model_blip(multimodal_device):
    """BLIP pre-processing and feature extraction against reference values."""
    my_dict = {}
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    with no_grad():
        extracted_feature_img = model.extract_features(
            {"image": processed_pic, "text_input": ""}, mode="image"
        )
        extracted_feature_text = model.extract_features(
            {"image": "", "text_input": processed_text}, mode="text"
        )

    check_list_processed_pic = [
        -1.0039474964141846,
        -1.0039474964141846,
        -0.8433647751808167,
        -0.6097899675369263,
        -0.5951915383338928,
        -0.6243883967399597,
        -0.6827820539474487,
        -0.6097899675369263,
        -0.7119789123535156,
        -1.0623412132263184,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
    )

    check_list_extracted_feature_img = [
        -0.02480311505496502,
        0.05037587881088257,
        0.039517853409051895,
        -0.06994109600782394,
        -0.12886561453342438,
        0.047039758414030075,
        -0.11620642244815826,
        -0.003398326924070716,
        -0.07324369996786118,
        0.06994668394327164,
    ]
    for i, num in zip(
        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        0.0118643119931221,
        -0.01291718054562807,
        -0.0009687161073088646,
        0.01428765058517456,
        -0.05591396614909172,
        0.07386433333158493,
        -0.11475936323404312,
        0.01620068959891796,
        0.0062415082938969135,
        0.0034833091776818037,
    ]
    for i, num in zip(
        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    del model, vis_processor, txt_processor
    cuda.empty_cache()

@pytest.mark.parametrize(
    "multimodal_device",
    [
        device("cpu"),
        pytest.param(
            device("cuda"),
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
    ],
)
def test_load_feature_extractor_model_albef(multimodal_device):
    """ALBEF pre-processing and feature extraction against reference values."""
    my_dict = {}
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_albef(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    with no_grad():
        extracted_feature_img = model.extract_features(
            {"image": processed_pic, "text_input": ""}, mode="image"
        )
        extracted_feature_text = model.extract_features(
            {"image": "", "text_input": processed_text}, mode="text"
        )

    check_list_processed_pic = [
        -1.0039474964141846,
        -1.0039474964141846,
        -0.8433647751808167,
        -0.6097899675369263,
        -0.5951915383338928,
        -0.6243883967399597,
        -0.6827820539474487,
        -0.6097899675369263,
        -0.7119789123535156,
        -1.0623412132263184,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
    )

    check_list_extracted_feature_img = [
        0.08971136063337326,
        -0.10915573686361313,
        -0.020636577159166336,
        0.048121627420186996,
        -0.05943416804075241,
        -0.129856139421463,
        -0.0034469354432076216,
        0.017888527363538742,
        -0.03284582123160362,
        -0.1037328764796257,
    ]
    for i, num in zip(
        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        -0.06229640915989876,
        0.11278597265481949,
        0.06628583371639252,
        0.1649140566587448,
        0.068987175822258,
        0.006291372701525688,
        0.03244050219655037,
        -0.049556829035282135,
        0.050752390176057816,
        -0.0421440489590168,
    ]
    for i, num in zip(
        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
    ):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    del model, vis_processor, txt_processor
    cuda.empty_cache()

@pytest.mark.parametrize(
    "multimodal_device",
    [
        device("cpu"),
        pytest.param(
            device("cuda"),
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
    ],
)
def test_load_feature_extractor_model_clip_base(multimodal_device):
    """CLIP (base) pre-processing and feature extraction against reference values."""
    my_dict = {}
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_base(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    with no_grad():
        extracted_feature_img = model.extract_features({"image": processed_pic})
        extracted_feature_text = model.extract_features({"text_input": processed_text})

    check_list_processed_pic = [
        -0.7995694875717163,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7703726291656494,
        -0.7703726291656494,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7703726291656494,
        -0.7703726291656494,
        -0.7703726291656494,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )

    check_list_extracted_feature_img = [
        0.15101124346256256,
        -0.03759124130010605,
        -0.40093156695365906,
        -0.32228705286979675,
        0.1576370894908905,
        -0.23340347409248352,
        -0.3892208933830261,
        0.20170584321022034,
        -0.030034437775611877,
        0.19082790613174438,
    ]
    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        0.15391531586647034,
        0.3078577518463135,
        0.21737979352474213,
        0.0775114893913269,
        -0.3013279139995575,
        0.2806251049041748,
        -0.0407320111989975,
        -0.02664487063884735,
        -0.1858849972486496,
        0.20347601175308228,
    ]
    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    del model, vis_processor, txt_processor
    cuda.empty_cache()

@pytest.mark.parametrize(
    "multimodal_device",
    [
        device("cpu"),
        pytest.param(
            device("cuda"),
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
    ],
)
def test_load_feature_extractor_model_clip_vitl14(multimodal_device):
    """CLIP ViT-L/14 pre-processing and feature extraction against reference values."""
    my_dict = {}
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    with no_grad():
        extracted_feature_img = model.extract_features({"image": processed_pic})
        extracted_feature_text = model.extract_features({"text_input": processed_text})

    check_list_processed_pic = [
        -0.7995694875717163,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7703726291656494,
        -0.7703726291656494,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7703726291656494,
        -0.7703726291656494,
        -0.7703726291656494,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )

    check_list_extracted_feature_img = [
        -0.3911527395248413,
        -0.35456305742263794,
        0.5724918842315674,
        0.3184954524040222,
        0.23444902896881104,
        -0.14105141162872314,
        0.26309096813201904,
        -0.0559774711728096,
        0.19491413235664368,
        0.01419895887374878,
    ]
    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        -0.07539052516222,
        0.0939129889011383,
        -0.2643853425979614,
        -0.2459949105978012,
        0.2387947291135788,
        -0.5204038023948669,
        -0.514020562171936,
        -0.32557412981987,
        0.18563221395015717,
        -0.3183072805404663,
    ]
    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    del model, vis_processor, txt_processor
    cuda.empty_cache()

@pytest.mark.parametrize(
    "multimodal_device",
    [
        device("cpu"),
        pytest.param(
            device("cuda"),
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
    ],
)
def test_load_feature_extractor_model_clip_vitl14_336(multimodal_device):
    """CLIP ViT-L/14-336 pre-processing and feature extraction against reference values."""
    my_dict = {}
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14_336(
        my_dict, multimodal_device
    )
    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
    processed_text = txt_processor["eval"](test_query)

    with no_grad():
        extracted_feature_img = model.extract_features({"image": processed_pic})
        extracted_feature_text = model.extract_features({"text_input": processed_text})

    check_list_processed_pic = [
        -0.7995694875717163,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.7849710583686829,
        -0.9163569211959839,
        -1.149931788444519,
        -1.0039474964141846,
    ]
    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)

    assert (
        processed_text
        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )

    check_list_extracted_feature_img = [
        -0.15060146152973175,
        -0.1998099535703659,
        0.5503129363059998,
        0.2589969336986542,
        -0.0182882659137249,
        -0.12753525376319885,
        0.018985718488693237,
        -0.17110440135002136,
        0.02220013737678528,
        0.01086437702178955,
    ]
    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_img[i], rel_tol=related_error
        )

    check_list_extracted_feature_text = [
        -0.1172553077340126,
        0.07105237245559692,
        -0.283934086561203,
        -0.24353823065757751,
        0.22662702202796936,
        -0.472959041595459,
        -0.5191791653633118,
        -0.29402273893356323,
        0.22669515013694763,
        -0.32044747471809387,
    ]
    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
        assert math.isclose(
            num, check_list_extracted_feature_text[i], rel_tol=related_error
        )

    del model, vis_processor, txt_processor
    cuda.empty_cache()

model_type = "blip"
# model_type = "blip2"
# model_type = "albef"
# model_type = "clip_base"
# model_type = "clip_vitl14"
# model_type = "clip_vitl14_336"

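# Precomputed reference values for test_parsing_images: expected pre-processed
# pixel slices, normalized query strings, and image/text feature slices for
# each supported model type.
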
pre_proc_pic_blip2_blip_albef = [
    -1.0039474964141846,
    -1.0039474964141846,
    -0.8433647751808167,
    -0.6097899675369263,
    -0.5951915383338928,
    -0.6243883967399597,
    -0.6827820539474487,
    -0.6097899675369263,
    -0.7119789123535156,
    -1.0623412132263184,
]

pre_proc_pic_clip_vitl14 = [
    -0.7995694875717163,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7703726291656494,
    -0.7703726291656494,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7703726291656494,
    -0.7703726291656494,
    -0.7703726291656494,
]

pre_proc_pic_clip_vitl14_336 = [
    -0.7995694875717163,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.7849710583686829,
    -0.9163569211959839,
    -1.149931788444519,
    -1.0039474964141846,
]

pre_proc_text_blip2_blip_albef = (
    "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
)

pre_proc_text_clip_clip_vitl14_clip_vitl14_336 = (
    "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
)

pre_extracted_feature_img_blip2 = [
    0.04566730558872223,
    -0.042554520070552826,
    -0.06970272958278656,
    -0.009771779179573059,
    0.01446065679192543,
    0.10173682868480682,
    0.007092420011758804,
    -0.020045937970280647,
    0.12923966348171234,
    0.006452132016420364,
]

pre_extracted_feature_img_blip = [
    -0.02480311505496502,
    0.05037587881088257,
    0.039517853409051895,
    -0.06994109600782394,
    -0.12886561453342438,
    0.047039758414030075,
    -0.11620642244815826,
    -0.003398326924070716,
    -0.07324369996786118,
    0.06994668394327164,
]

pre_extracted_feature_img_albef = [
    0.08971136063337326,
    -0.10915573686361313,
    -0.020636577159166336,
    0.048121627420186996,
    -0.05943416804075241,
    -0.129856139421463,
    -0.0034469354432076216,
    0.017888527363538742,
    -0.03284582123160362,
    -0.1037328764796257,
]

pre_extracted_feature_img_clip = [
    0.01621132344007492,
    -0.004035486374050379,
    -0.04304071143269539,
    -0.03459808602929115,
    0.016922621056437492,
    -0.025056276470422745,
    -0.04178355261683464,
    0.02165347896516323,
    -0.003224249929189682,
    0.020485712215304375,
]

# Currently identical to pre_extracted_feature_img_clip.
pre_extracted_feature_img_parsing_clip = [
    0.01621132344007492,
    -0.004035486374050379,
    -0.04304071143269539,
    -0.03459808602929115,
    0.016922621056437492,
    -0.025056276470422745,
    -0.04178355261683464,
    0.02165347896516323,
    -0.003224249929189682,
    0.020485712215304375,
]

pre_extracted_feature_img_clip_vitl14 = [
    -0.023943455889821053,
    -0.021703708916902542,
    0.035043686628341675,
    0.019495919346809387,
    0.014351222664117813,
    -0.008634116500616074,
    0.01610446907579899,
    -0.003426523646339774,
    0.011931191198527813,
    0.0008691544644534588,
]

pre_extracted_feature_img_clip_vitl14_336 = [
    -0.15060146152973175,
    -0.1998099535703659,
    0.5503129363059998,
    0.2589969336986542,
    -0.0182882659137249,
    -0.12753525376319885,
    0.018985718488693237,
    -0.17110440135002136,
    0.02220013737678528,
    0.01086437702178955,
]

pre_extracted_feature_text_blip2 = [
    -0.1384204626083374,
    -0.008662976324558258,
    0.006269007455557585,
    0.03151319921016693,
    0.060558050870895386,
    -0.03230040520429611,
    0.015861615538597107,
    -0.11856459826231003,
    -0.058296192437410355,
    0.03699290752410889,
]

pre_extracted_feature_text_blip = [
    0.0118643119931221,
    -0.01291718054562807,
    -0.0009687161073088646,
    0.01428765058517456,
    -0.05591396614909172,
    0.07386433333158493,
    -0.11475936323404312,
    0.01620068959891796,
    0.0062415082938969135,
    0.0034833091776818037,
]

pre_extracted_feature_text_albef = [
    -0.06229640915989876,
    0.11278597265481949,
    0.06628583371639252,
    0.1649140566587448,
    0.068987175822258,
    0.006291372701525688,
    0.03244050219655037,
    -0.049556829035282135,
    0.050752390176057816,
    -0.0421440489590168,
]

pre_extracted_feature_text_clip = [
    0.018169036135077477,
    0.03634127229452133,
    0.025660742074251175,
    0.009149895049631596,
    -0.035570453852415085,
    0.033126577734947205,
    -0.004808237310498953,
    -0.0031453112605959177,
    -0.02194291725754738,
    0.024019461125135422,
]

pre_extracted_feature_text_clip_vitl14 = [
    -0.0055463071912527084,
    0.006908962037414312,
    -0.019450219348073006,
    -0.018097277730703354,
    0.017567576840519905,
    -0.03828490898013115,
    -0.03781530633568764,
    -0.023951737210154533,
    0.01365653332322836,
    -0.02341713197529316,
]

pre_extracted_feature_text_clip_vitl14_336 = [
    -0.1172553077340126,
    0.07105237245559692,
    -0.283934086561203,
    -0.24353823065757751,
    0.22662702202796936,
    -0.472959041595459,
    -0.5191791653633118,
    -0.29402273893356323,
    0.22669515013694763,
    -0.32044747471809387,
]

@pytest.mark.parametrize(
    (
        "pre_multimodal_device",
        "pre_model",
        "pre_proc_pic",
        "pre_proc_text",
        "pre_extracted_feature_img",
        "pre_extracted_feature_text",
        "pre_extracted_feature_img2",
    ),
    [
        pytest.param(
            device("cuda"),
            "blip2",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_blip2,
            pre_extracted_feature_text_blip2,
            pre_extracted_feature_img_blip2,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
        (
            device("cpu"),
            "blip",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_blip,
            pre_extracted_feature_text_blip,
            pre_extracted_feature_img_blip,
        ),
        pytest.param(
            device("cuda"),
            "blip",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_blip,
            pre_extracted_feature_text_blip,
            pre_extracted_feature_img_blip,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
        (
            device("cpu"),
            "albef",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_albef,
            pre_extracted_feature_text_albef,
            pre_extracted_feature_img_albef,
        ),
        pytest.param(
            device("cuda"),
            "albef",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_albef,
            pre_extracted_feature_text_albef,
            pre_extracted_feature_img_albef,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
        (
            device("cpu"),
            "clip_base",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_parsing_clip,
            pre_extracted_feature_text_clip,
            pre_extracted_feature_img_clip,
        ),
        pytest.param(
            device("cuda"),
            "clip_base",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_parsing_clip,
            pre_extracted_feature_text_clip,
            pre_extracted_feature_img_clip,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
        (
            device("cpu"),
            "clip_vitl14",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14,
            pre_extracted_feature_text_clip_vitl14,
            pre_extracted_feature_img_clip_vitl14,
        ),
        pytest.param(
            device("cuda"),
            "clip_vitl14",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14,
            pre_extracted_feature_text_clip_vitl14,
            pre_extracted_feature_img_clip_vitl14,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu is not available"
            ),
        ),
        # (device("cpu"), "clip_vitl14_336"),
        # pytest.param(device("cuda"), "clip_vitl14_336", marks=pytest.mark.skipif(gpu_is_not_available, reason="gpu is not available")),
    ],
)
def test_parsing_images(
    pre_multimodal_device,
    pre_model,
    pre_proc_pic,
    pre_proc_text,
    pre_extracted_feature_img,
    pre_extracted_feature_text,
    pre_extracted_feature_img2,
):
    """parsing_images and querys_processing should reproduce the precomputed features."""
    mydict = {
        "IMG_2746": {"filename": "./test/data/IMG_2746.png"},
        "IMG_2750": {"filename": "./test/data/IMG_2750.png"},
    }
    ms.MultimodalSearch.multimodal_device = pre_multimodal_device
    (
        model,
        vis_processor,
        txt_processor,
        image_keys,
        image_names,
        features_image_stacked,
    ) = ms.MultimodalSearch.parsing_images(mydict, pre_model)

    for i, num in zip(range(10), features_image_stacked[0, 10:20].tolist()):
        assert math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error)

    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
    test_query = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = (
        vis_processor["eval"](test_pic).unsqueeze(0).to(pre_multimodal_device)
    )
    processed_text = txt_processor["eval"](test_query)

    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
        assert math.isclose(num, pre_proc_pic[i], rel_tol=related_error)

    assert processed_text == pre_proc_text

    search_query = [
        {"text_input": test_query},
        {"image": TEST_IMAGE_2},
    ]
    multi_features_stacked = ms.MultimodalSearch.querys_processing(
        mydict, search_query, model, txt_processor, vis_processor, pre_model
    )

    for i, num in zip(range(10), multi_features_stacked[0, 10:20].tolist()):
        assert math.isclose(num, pre_extracted_feature_text[i], rel_tol=related_error)

    for i, num in zip(range(10), multi_features_stacked[1, 10:20].tolist()):
        assert math.isclose(num, pre_extracted_feature_img2[i], rel_tol=related_error)

    del model, vis_processor, txt_processor
    cuda.empty_cache()
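# Note: the flow exercised above mirrors typical MultimodalSearch usage:
# parsing_images(mydict, model_type) loads a model and stacks image features,
# and querys_processing(mydict, search_query, model, txt_processor,
# vis_processor, model_type) embeds text and image queries into the same
# feature space so they can be compared against the stacked image features.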