removed windows in CI and added test in multimodal search

2025-10-30 21:46:04 +02:00 · 2023-03-01 15:59:44 +01:00 · 2023-03-01 15:59:44 +01:00 · baa6884a87
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -14,7 +14,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-22.04, windows-latest]
+        os: [ubuntu-22.04]
        python-version: [3.9]
    steps:
    - name: Checkout repository
--- a/misinformation/test/test_multimodal_search.py
+++ b/misinformation/test/test_multimodal_search.py
@ -1,6 +1,6 @@
-import os
+from PIL import Image
 import numpy
-from torch import device, cuda
+from torch import device, cuda, no_grad
 from lavis.models import load_model_and_preprocess
 import misinformation.multimodal_search as ms

@ -24,6 +24,428 @@ def test_read_img():
    assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]


-# def test_load_feature_extractor_model_blip2():
-#    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
-#    (model, vis_processors, txt_processors,) = ms.load_feature_extractor_model_blip2(multimodal_device)
+def test_load_feature_extractor_model_blip2():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip2(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        with cuda.amp.autocast(enabled=(device != device("cpu"))):
+            extracted_feature_img = model.extract_features(
+                {"image": processed_pic, "text_input": ""}, mode="image"
+            )
+            extracted_feature_text = model.extract_features(
+                {"image": "", "text_input": processed_text}, mode="text"
+            )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.04566730558872223,
+        -0.042554520070552826,
+        -0.06970272958278656,
+        -0.009771779179573059,
+        0.01446065679192543,
+        0.10173682868480682,
+        0.007092420011758804,
+        -0.020045937970280647,
+        0.12923966348171234,
+        0.006452132016420364,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.1384519338607788,
+        -0.008663734421133995,
+        0.006240826100111008,
+        0.031466349959373474,
+        0.060625165700912476,
+        -0.03230545297265053,
+        0.01585903950035572,
+        -0.11856520175933838,
+        -0.05823372304439545,
+        0.036941494792699814,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_blip():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.02480311505496502,
+        0.05037587881088257,
+        0.039517853409051895,
+        -0.06994109600782394,
+        -0.12886561453342438,
+        0.047039758414030075,
+        -0.11620642244815826,
+        -0.003398326924070716,
+        -0.07324369996786118,
+        0.06994668394327164,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.0118643119931221,
+        -0.01291718054562807,
+        -0.0009687161073088646,
+        0.01428765058517456,
+        -0.05591396614909172,
+        0.07386433333158493,
+        -0.11475936323404312,
+        0.01620068959891796,
+        0.0062415082938969135,
+        0.0034833091776818037,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_albef():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_albef(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.08971136063337326,
+        -0.10915573686361313,
+        -0.020636577159166336,
+        0.048121627420186996,
+        -0.05943416804075241,
+        -0.129856139421463,
+        -0.0034469354432076216,
+        0.017888527363538742,
+        -0.03284582123160362,
+        -0.1037328764796257,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.06229640915989876,
+        0.11278597265481949,
+        0.06628583371639252,
+        0.1649140566587448,
+        0.068987175822258,
+        0.006291372701525688,
+        0.03244050219655037,
+        -0.049556829035282135,
+        0.050752390176057816,
+        -0.0421440489590168,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_base():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_base(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        0.15101124346256256,
+        -0.03759124130010605,
+        -0.40093156695365906,
+        -0.32228705286979675,
+        0.1576370894908905,
+        -0.23340347409248352,
+        -0.3892208933830261,
+        0.20170584321022034,
+        -0.030034437775611877,
+        0.19082790613174438,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        0.15391531586647034,
+        0.3078577518463135,
+        0.21737979352474213,
+        0.0775114893913269,
+        -0.3013279139995575,
+        0.2806251049041748,
+        -0.0407320111989975,
+        -0.02664487063884735,
+        -0.1858849972486496,
+        0.20347601175308228,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.3911527395248413,
+        -0.35456305742263794,
+        0.5724918842315674,
+        0.3184954524040222,
+        0.23444902896881104,
+        -0.14105141162872314,
+        0.26309096813201904,
+        -0.0559774711728096,
+        0.19491413235664368,
+        0.01419895887374878,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.07539052516222,
+        0.0939129889011383,
+        -0.2643853425979614,
+        -0.2459949105978012,
+        0.2387947291135788,
+        -0.5204038023948669,
+        -0.514020562171936,
+        -0.32557412981987,
+        0.18563221395015717,
+        -0.3183072805404663,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14_336():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14_336(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.9163569211959839,
+        -1.149931788444519,
+        -1.0039474964141846,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.15060146152973175,
+        -0.1998099535703659,
+        0.5503129363059998,
+        0.2589969336986542,
+        -0.0182882659137249,
+        -0.12753525376319885,
+        0.018985718488693237,
+        -0.17110440135002136,
+        0.02220013737678528,
+        0.01086437702178955,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.1172553077340126,
+        0.07105237245559692,
+        -0.283934086561203,
+        -0.24353823065757751,
+        0.22662702202796936,
+        -0.472959041595459,
+        -0.5191791653633118,
+        -0.29402273893356323,
+        0.22669515013694763,
+        -0.32044747471809387,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()