From baa6884a87892a72c761e24062d0d12f3791c94a Mon Sep 17 00:00:00 2001
From: Petr Andriushchenko
Date: Wed, 1 Mar 2023 15:59:44 +0100
Subject: [PATCH] Remove Windows from CI and add tests for multimodal search

---
 .github/workflows/ci.yml                      |   2 +-
 misinformation/test/test_multimodal_search.py | 432 +++++++++++++++++-
 2 files changed, 428 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d8f5e53..234d32c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-22.04, windows-latest]
+        os: [ubuntu-22.04]
         python-version: [3.9]
     steps:
       - name: Checkout repository
diff --git a/misinformation/test/test_multimodal_search.py b/misinformation/test/test_multimodal_search.py
index 3917898..1151fad 100644
--- a/misinformation/test/test_multimodal_search.py
+++ b/misinformation/test/test_multimodal_search.py
@@ -1,6 +1,6 @@
-import os
+from PIL import Image
 import numpy
-from torch import device, cuda
+from torch import device, cuda, no_grad
 from lavis.models import load_model_and_preprocess
 
 import misinformation.multimodal_search as ms
@@ -24,6 +24,428 @@ def test_read_img():
     assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]
 
 
-# def test_load_feature_extractor_model_blip2():
-#     multimodal_device = device("cuda" if cuda.is_available() else "cpu")
-#     (model, vis_processors, txt_processors,) = ms.load_feature_extractor_model_blip2(multimodal_device)
+def test_load_feature_extractor_model_blip2():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip2(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        with cuda.amp.autocast(enabled=(multimodal_device != device("cpu"))):
+            extracted_feature_img = model.extract_features(
+                {"image": processed_pic, "text_input": ""}, mode="image"
+            )
+            extracted_feature_text = model.extract_features(
+                {"image": "", "text_input": processed_text}, mode="text"
+            )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.04566730558872223,
+        -0.042554520070552826,
+        -0.06970272958278656,
+        -0.009771779179573059,
+        0.01446065679192543,
+        0.10173682868480682,
+        0.007092420011758804,
+        -0.020045937970280647,
+        0.12923966348171234,
+        0.006452132016420364,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.1384519338607788,
+        -0.008663734421133995,
+        0.006240826100111008,
+        0.031466349959373474,
+        0.060625165700912476,
+        -0.03230545297265053,
+        0.01585903950035572,
+        -0.11856520175933838,
+        -0.05823372304439545,
+        0.036941494792699814,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_blip():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.02480311505496502,
+        0.05037587881088257,
+        0.039517853409051895,
+        -0.06994109600782394,
+        -0.12886561453342438,
+        0.047039758414030075,
+        -0.11620642244815826,
+        -0.003398326924070716,
+        -0.07324369996786118,
+        0.06994668394327164,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.0118643119931221,
+        -0.01291718054562807,
+        -0.0009687161073088646,
+        0.01428765058517456,
+        -0.05591396614909172,
+        0.07386433333158493,
+        -0.11475936323404312,
+        0.01620068959891796,
+        0.0062415082938969135,
+        0.0034833091776818037,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_albef():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_albef(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.08971136063337326,
+        -0.10915573686361313,
+        -0.020636577159166336,
+        0.048121627420186996,
+        -0.05943416804075241,
+        -0.129856139421463,
+        -0.0034469354432076216,
+        0.017888527363538742,
+        -0.03284582123160362,
+        -0.1037328764796257,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.06229640915989876,
+        0.11278597265481949,
+        0.06628583371639252,
+        0.1649140566587448,
+        0.068987175822258,
+        0.006291372701525688,
+        0.03244050219655037,
+        -0.049556829035282135,
+        0.050752390176057816,
+        -0.0421440489590168,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_base():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_base(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        0.15101124346256256,
+        -0.03759124130010605,
+        -0.40093156695365906,
+        -0.32228705286979675,
+        0.1576370894908905,
+        -0.23340347409248352,
+        -0.3892208933830261,
+        0.20170584321022034,
+        -0.030034437775611877,
+        0.19082790613174438,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        0.15391531586647034,
+        0.3078577518463135,
+        0.21737979352474213,
+        0.0775114893913269,
+        -0.3013279139995575,
+        0.2806251049041748,
+        -0.0407320111989975,
+        -0.02664487063884735,
+        -0.1858849972486496,
+        0.20347601175308228,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.3911527395248413,
+        -0.35456305742263794,
+        0.5724918842315674,
+        0.3184954524040222,
+        0.23444902896881104,
+        -0.14105141162872314,
+        0.26309096813201904,
+        -0.0559774711728096,
+        0.19491413235664368,
+        0.01419895887374878,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.07539052516222,
+        0.0939129889011383,
+        -0.2643853425979614,
+        -0.2459949105978012,
+        0.2387947291135788,
+        -0.5204038023948669,
+        -0.514020562171936,
+        -0.32557412981987,
+        0.18563221395015717,
+        -0.3183072805404663,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14_336():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14_336(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_query = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_query)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.9163569211959839,
+        -1.149931788444519,
+        -1.0039474964141846,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.15060146152973175,
+        -0.1998099535703659,
+        0.5503129363059998,
+        0.2589969336986542,
+        -0.0182882659137249,
+        -0.12753525376319885,
+        0.018985718488693237,
+        -0.17110440135002136,
+        0.02220013737678528,
+        0.01086437702178955,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.1172553077340126,
+        0.07105237245559692,
+        -0.283934086561203,
+        -0.24353823065757751,
+        0.22662702202796936,
+        -0.472959041595459,
+        -0.5191791653633118,
+        -0.29402273893356323,
+        0.22669515013694763,
+        -0.32044747471809387,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
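Every test in this patch follows the same load/preprocess/extract pattern; only the MultimodalSearch.load_feature_extractor_model_* helper changes, and the CLIP variants call extract_features without a mode argument while BLIP/BLIP2/ALBEF pass mode="image" or mode="text". A minimal sketch of that flow outside pytest, assuming the API exactly as exercised in the diff above; the image path and query string are illustrative placeholders, not values from the test suite:

    from PIL import Image
    from torch import cuda, device, no_grad

    import misinformation.multimodal_search as ms

    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
    my_dict = {}

    # Each loader returns the model plus "eval" image/text processors.
    (
        model,
        vis_processor,
        txt_processor,
    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
        my_dict, multimodal_device
    )

    pic = Image.open("bird.png").convert("RGB")  # placeholder image path
    image_tensor = vis_processor["eval"](pic).unsqueeze(0).to(multimodal_device)
    text_input = txt_processor["eval"]("a bird sitting on a tree")  # placeholder query

    with no_grad():
        image_features = model.extract_features(
            {"image": image_tensor, "text_input": ""}, mode="image"
        )
        text_features = model.extract_features(
            {"image": "", "text_input": text_input}, mode="text"
        )

    # The tests compare slices of these projected embeddings against
    # reference values; here we just inspect their shapes.
    print(image_features["image_embeds_proj"].shape)
    print(text_features["text_embeds_proj"].shape)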