From 04592825e6508f095a2e9eed91fd42425b9895cf Mon Sep 17 00:00:00 2001
From: Petr Andriushchenko <pitandmind@gmail.com>
Date: Tue, 7 Mar 2023 10:37:35 +0100
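Subject: [PATCH] fixed tests in test_multimodal_search.py

Remove the six standalone test_load_feature_extractor_model_* tests
(blip2, blip, albef, clip_base, clip_vitl14, clip_vitl14_336) and the
module-level model_type selector. Update the expected values in
pre_extracted_feature_img_clip_vitl14_336 and
pre_extracted_feature_text_clip_vitl14_336, and replace the commented-out
clip_vitl14_336 entries in the test_parsing_images parametrization with
real cpu and cuda cases (the cuda case is skipped when no GPU is
available).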

---
 misinformation/test/test_multimodal_search.py | 706 +-----------------
 1 file changed, 41 insertions(+), 665 deletions(-)

diff --git a/misinformation/test/test_multimodal_search.py b/misinformation/test/test_multimodal_search.py
index 51e1fe8..740bf35 100644
--- a/misinformation/test/test_multimodal_search.py
+++ b/misinformation/test/test_multimodal_search.py
@@ -28,646 +28,6 @@ def test_read_img():
     assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]
 
 
-@pytest.mark.skipif(gpu_is_not_available, reason="model for gpu only")
-def test_load_feature_extractor_model_blip2():
-    my_dict = {}
-    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_blip2(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    extracted_feature_img = model.extract_features(
-        {"image": processed_pic, "text_input": ""}, mode="image"
-    )
-    extracted_feature_text = model.extract_features(
-        {"image": "", "text_input": processed_text}, mode="text"
-    )
-    check_list_processed_pic = [
-        -1.0039474964141846,
-        -1.0039474964141846,
-        -0.8433647751808167,
-        -0.6097899675369263,
-        -0.5951915383338928,
-        -0.6243883967399597,
-        -0.6827820539474487,
-        -0.6097899675369263,
-        -0.7119789123535156,
-        -1.0623412132263184,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
-    )
-
-    check_list_extracted_feature_img = [
-        0.04566730558872223,
-        -0.042554520070552826,
-        -0.06970272958278656,
-        -0.009771779179573059,
-        0.01446065679192543,
-        0.10173682868480682,
-        0.007092420011758804,
-        -0.020045937970280647,
-        0.12923966348171234,
-        0.006452132016420364,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        -0.1384519338607788,
-        -0.008663734421133995,
-        0.006240826100111008,
-        0.031466349959373474,
-        0.060625165700912476,
-        -0.03230545297265053,
-        0.01585903950035572,
-        -0.11856520175933838,
-        -0.05823372304439545,
-        0.036941494792699814,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    image_paths = [TEST_IMAGE_2, TEST_IMAGE_3]
-
-    raw_images, images_tensors = ms.MultimodalSearch.read_and_process_images(
-        my_dict, image_paths, vis_processor
-    )
-
-    assert list(numpy.array(raw_images[0])[257][34]) == [70, 66, 63]
-
-    check_list_images_tensors = [
-        -1.0039474964141846,
-        -1.0039474964141846,
-        -0.8433647751808167,
-        -0.6097899675369263,
-        -0.5951915383338928,
-        -0.6243883967399597,
-        -0.6827820539474487,
-        -0.6097899675369263,
-        -0.7119789123535156,
-        -1.0623412132263184,
-    ]
-    for i, num in zip(range(10), images_tensors[0, 0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_images_tensors[i], rel_tol=related_error)
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-@pytest.mark.parametrize(
-    ("multimodal_device"),
-    [
-        device("cpu"),
-        pytest.param(
-            device("cuda"),
-            marks=pytest.mark.skipif(
-                gpu_is_not_available, reason="gpu_is_not_availible"
-            ),
-        ),
-    ],
-)
-def test_load_feature_extractor_model_blip(multimodal_device):
-    my_dict = {}
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    with no_grad():
-        extracted_feature_img = model.extract_features(
-            {"image": processed_pic, "text_input": ""}, mode="image"
-        )
-        extracted_feature_text = model.extract_features(
-            {"image": "", "text_input": processed_text}, mode="text"
-        )
-
-    check_list_processed_pic = [
-        -1.0039474964141846,
-        -1.0039474964141846,
-        -0.8433647751808167,
-        -0.6097899675369263,
-        -0.5951915383338928,
-        -0.6243883967399597,
-        -0.6827820539474487,
-        -0.6097899675369263,
-        -0.7119789123535156,
-        -1.0623412132263184,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
-    )
-
-    check_list_extracted_feature_img = [
-        -0.02480311505496502,
-        0.05037587881088257,
-        0.039517853409051895,
-        -0.06994109600782394,
-        -0.12886561453342438,
-        0.047039758414030075,
-        -0.11620642244815826,
-        -0.003398326924070716,
-        -0.07324369996786118,
-        0.06994668394327164,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        0.0118643119931221,
-        -0.01291718054562807,
-        -0.0009687161073088646,
-        0.01428765058517456,
-        -0.05591396614909172,
-        0.07386433333158493,
-        -0.11475936323404312,
-        0.01620068959891796,
-        0.0062415082938969135,
-        0.0034833091776818037,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-@pytest.mark.parametrize(
-    ("multimodal_device"),
-    [
-        device("cpu"),
-        pytest.param(
-            device("cuda"),
-            marks=pytest.mark.skipif(
-                gpu_is_not_available, reason="gpu_is_not_availible"
-            ),
-        ),
-    ],
-)
-def test_load_feature_extractor_model_albef(multimodal_device):
-    my_dict = {}
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_albef(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    with no_grad():
-        extracted_feature_img = model.extract_features(
-            {"image": processed_pic, "text_input": ""}, mode="image"
-        )
-        extracted_feature_text = model.extract_features(
-            {"image": "", "text_input": processed_text}, mode="text"
-        )
-
-    check_list_processed_pic = [
-        -1.0039474964141846,
-        -1.0039474964141846,
-        -0.8433647751808167,
-        -0.6097899675369263,
-        -0.5951915383338928,
-        -0.6243883967399597,
-        -0.6827820539474487,
-        -0.6097899675369263,
-        -0.7119789123535156,
-        -1.0623412132263184,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
-    )
-
-    check_list_extracted_feature_img = [
-        0.08971136063337326,
-        -0.10915573686361313,
-        -0.020636577159166336,
-        0.048121627420186996,
-        -0.05943416804075241,
-        -0.129856139421463,
-        -0.0034469354432076216,
-        0.017888527363538742,
-        -0.03284582123160362,
-        -0.1037328764796257,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        -0.06229640915989876,
-        0.11278597265481949,
-        0.06628583371639252,
-        0.1649140566587448,
-        0.068987175822258,
-        0.006291372701525688,
-        0.03244050219655037,
-        -0.049556829035282135,
-        0.050752390176057816,
-        -0.0421440489590168,
-    ]
-    for i, num in zip(
-        range(10), extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist()
-    ):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-@pytest.mark.parametrize(
-    ("multimodal_device"),
-    [
-        device("cpu"),
-        pytest.param(
-            device("cuda"),
-            marks=pytest.mark.skipif(
-                gpu_is_not_available, reason="gpu_is_not_availible"
-            ),
-        ),
-    ],
-)
-def test_load_feature_extractor_model_clip_base(multimodal_device):
-    my_dict = {}
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_base(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    with no_grad():
-        extracted_feature_img = model.extract_features({"image": processed_pic})
-        extracted_feature_text = model.extract_features({"text_input": processed_text})
-
-    check_list_processed_pic = [
-        -0.7995694875717163,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7703726291656494,
-        -0.7703726291656494,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7703726291656494,
-        -0.7703726291656494,
-        -0.7703726291656494,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-
-    check_list_extracted_feature_img = [
-        0.15101124346256256,
-        -0.03759124130010605,
-        -0.40093156695365906,
-        -0.32228705286979675,
-        0.1576370894908905,
-        -0.23340347409248352,
-        -0.3892208933830261,
-        0.20170584321022034,
-        -0.030034437775611877,
-        0.19082790613174438,
-    ]
-    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        0.15391531586647034,
-        0.3078577518463135,
-        0.21737979352474213,
-        0.0775114893913269,
-        -0.3013279139995575,
-        0.2806251049041748,
-        -0.0407320111989975,
-        -0.02664487063884735,
-        -0.1858849972486496,
-        0.20347601175308228,
-    ]
-    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-@pytest.mark.parametrize(
-    ("multimodal_device"),
-    [
-        device("cpu"),
-        pytest.param(
-            device("cuda"),
-            marks=pytest.mark.skipif(
-                gpu_is_not_available, reason="gpu_is_not_availible"
-            ),
-        ),
-    ],
-)
-def test_load_feature_extractor_model_clip_vitl14(multimodal_device):
-    my_dict = {}
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    with no_grad():
-        extracted_feature_img = model.extract_features({"image": processed_pic})
-        extracted_feature_text = model.extract_features({"text_input": processed_text})
-
-    check_list_processed_pic = [
-        -0.7995694875717163,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7703726291656494,
-        -0.7703726291656494,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7703726291656494,
-        -0.7703726291656494,
-        -0.7703726291656494,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-
-    check_list_extracted_feature_img = [
-        -0.3911527395248413,
-        -0.35456305742263794,
-        0.5724918842315674,
-        0.3184954524040222,
-        0.23444902896881104,
-        -0.14105141162872314,
-        0.26309096813201904,
-        -0.0559774711728096,
-        0.19491413235664368,
-        0.01419895887374878,
-    ]
-    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        -0.07539052516222,
-        0.0939129889011383,
-        -0.2643853425979614,
-        -0.2459949105978012,
-        0.2387947291135788,
-        -0.5204038023948669,
-        -0.514020562171936,
-        -0.32557412981987,
-        0.18563221395015717,
-        -0.3183072805404663,
-    ]
-    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-@pytest.mark.parametrize(
-    ("multimodal_device"),
-    [
-        device("cpu"),
-        pytest.param(
-            device("cuda"),
-            marks=pytest.mark.skipif(
-                gpu_is_not_available, reason="gpu_is_not_availible"
-            ),
-        ),
-    ],
-)
-def test_load_feature_extractor_model_clip_vitl14_336(multimodal_device):
-    my_dict = {}
-    (
-        model,
-        vis_processor,
-        txt_processor,
-    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14_336(
-        my_dict, multimodal_device
-    )
-    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
-    test_querry = (
-        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
-    processed_text = txt_processor["eval"](test_querry)
-
-    with no_grad():
-        extracted_feature_img = model.extract_features({"image": processed_pic})
-        extracted_feature_text = model.extract_features({"text_input": processed_text})
-
-    check_list_processed_pic = [
-        -0.7995694875717163,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.7849710583686829,
-        -0.9163569211959839,
-        -1.149931788444519,
-        -1.0039474964141846,
-    ]
-    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()):
-        assert (
-            math.isclose(num, check_list_processed_pic[i], rel_tol=related_error)
-            is True
-        )
-
-    assert (
-        processed_text
-        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
-    )
-
-    check_list_extracted_feature_img = [
-        -0.15060146152973175,
-        -0.1998099535703659,
-        0.5503129363059998,
-        0.2589969336986542,
-        -0.0182882659137249,
-        -0.12753525376319885,
-        0.018985718488693237,
-        -0.17110440135002136,
-        0.02220013737678528,
-        0.01086437702178955,
-    ]
-    for i, num in zip(range(10), extracted_feature_img[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_img[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    check_list_extracted_feature_text = [
-        -0.1172553077340126,
-        0.07105237245559692,
-        -0.283934086561203,
-        -0.24353823065757751,
-        0.22662702202796936,
-        -0.472959041595459,
-        -0.5191791653633118,
-        -0.29402273893356323,
-        0.22669515013694763,
-        -0.32044747471809387,
-    ]
-    for i, num in zip(range(10), extracted_feature_text[0, 10:20].tolist()):
-        assert (
-            math.isclose(
-                num, check_list_extracted_feature_text[i], rel_tol=related_error
-            )
-            is True
-        )
-
-    del model, vis_processor, txt_processor
-    cuda.empty_cache()
-
-
-model_type = "blip"
-# model_type = "blip2"
-# model_type = "albef"
-# model_type = "clip_base"
-# model_type = "clip_vitl14"
-# model_type = "clip_vitl14_336"
-
-
 pre_proc_pic_blip2_blip_albef = [
     -1.0039474964141846,
     -1.0039474964141846,
@@ -793,16 +153,16 @@ pre_extracted_feature_img_clip_vitl14 = [
 ]
 
 pre_extracted_feature_img_clip_vitl14_336 = [
-    -0.15060146152973175,
-    -0.1998099535703659,
-    0.5503129363059998,
-    0.2589969336986542,
-    -0.0182882659137249,
-    -0.12753525376319885,
-    0.018985718488693237,
-    -0.17110440135002136,
-    0.02220013737678528,
-    0.01086437702178955,
+    -0.009511193260550499,
+    -0.012618942186236382,
+    0.034754861146211624,
+    0.016356879845261574,
+    -0.0011549904011189938,
+    -0.008054453879594803,
+    0.0011990377679467201,
+    -0.010806051082909107,
+    0.00140204350464046,
+    0.0006861367146484554,
 ]
 
 pre_extracted_feature_text_blip2 = [
@@ -871,16 +231,16 @@ pre_extracted_feature_text_clip_vitl14 = [
 ]
 
 pre_extracted_feature_text_clip_vitl14_336 = [
-    -0.1172553077340126,
-    0.07105237245559692,
-    -0.283934086561203,
-    -0.24353823065757751,
-    0.22662702202796936,
-    -0.472959041595459,
-    -0.5191791653633118,
-    -0.29402273893356323,
-    0.22669515013694763,
-    -0.32044747471809387,
+    -0.008720514364540577,
+    0.005284308455884457,
+    -0.021116750314831734,
+    -0.018112430348992348,
+    0.01685470901429653,
+    -0.03517491742968559,
+    -0.038612402975559235,
+    -0.021867064759135246,
+    0.01685977540910244,
+    -0.023832324892282486,
 ]
 
 
@@ -991,11 +351,27 @@ pre_extracted_feature_text_clip_vitl14_336 = [
                 gpu_is_not_available, reason="gpu_is_not_availible"
             ),
         ),
-        #        (device("cpu"),"clip_vitl14_336"),
-        #
-        #
-        #
-        #        pytest.param( device("cuda"),"clip_vitl14_336", marks=pytest.mark.skipif(gpu_is_not_available, reason="gpu_is_not_availible"),),
+        (
+            device("cpu"),
+            "clip_vitl14_336",
+            pre_proc_pic_clip_vitl14_336,
+            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
+            pre_extracted_feature_img_clip_vitl14_336,
+            pre_extracted_feature_text_clip_vitl14_336,
+            pre_extracted_feature_img_clip_vitl14_336,
+        ),
+        pytest.param(
+            device("cuda"),
+            "clip_vitl14_336",
+            pre_proc_pic_clip_vitl14_336,
+            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
+            pre_extracted_feature_img_clip_vitl14_336,
+            pre_extracted_feature_text_clip_vitl14_336,
+            pre_extracted_feature_img_clip_vitl14_336,
+            marks=pytest.mark.skipif(
+                gpu_is_not_available, reason="gpu_is_not_availible"
+            ),
+        ),
     ],
 )
 def test_parsing_images(