Mirror of https://github.com/ssciwr/AMMICO.git, synced 2025-11-03 23:46:04 +02:00
Removed Windows from the CI matrix and added tests for multimodal search
parent 31b006311a
commit baa6884a87
.github/workflows/ci.yml: 2 changed lines (vendored)
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-22.04, windows-latest]
+        os: [ubuntu-22.04]
         python-version: [3.9]
     steps:
     - name: Checkout repository
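The second changed file (the multimodal-search test module; the page does not preserve its path) re-enables the previously commented-out feature-extractor test and adds five more, one per model variant. All six follow the same pattern: load a model and its processors through an ms.MultimodalSearch.load_feature_extractor_model_* wrapper, preprocess one image and one query sentence, extract features under no_grad, and pin fixed tensor slices to reference values. A minimal sketch of that pattern using LAVIS directly; the checkpoint choice ("blip_feature_extractor"/"base") and the image path are illustrative assumptions, not taken from the commit:

from PIL import Image
from torch import cuda, device, no_grad
from lavis.models import load_model_and_preprocess

multimodal_device = device("cuda" if cuda.is_available() else "cpu")
# Illustrative checkpoint; the tests below load theirs via ms.MultimodalSearch wrappers.
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_feature_extractor",
    model_type="base",
    is_eval=True,
    device=multimodal_device,
)
pic = Image.open("bird.jpg").convert("RGB")  # hypothetical test image
image = vis_processors["eval"](pic).unsqueeze(0).to(multimodal_device)
text = txt_processors["eval"]("The bird sat on a tree.")
with no_grad():
    img_feat = model.extract_features({"image": image, "text_input": ""}, mode="image")
    txt_feat = model.extract_features({"image": "", "text_input": text}, mode="text")
# The tests then pin slices such as img_feat["image_embeds_proj"][0, 0, 10:20]
# against reference values recorded on a known setup.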
Tests for misinformation.multimodal_search:
@@ -1,6 +1,6 @@
-import os
+from PIL import Image
 import numpy
-from torch import device, cuda
+from torch import device, cuda, no_grad
 from lavis.models import load_model_and_preprocess
 import misinformation.multimodal_search as ms
 
@@ -24,6 +24,428 @@ def test_read_img():
     assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]
 
 
-# def test_load_feature_extractor_model_blip2():
-#    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
-#    (model, vis_processors, txt_processors,) = ms.load_feature_extractor_model_blip2(multimodal_device)
+def test_load_feature_extractor_model_blip2():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip2(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        with cuda.amp.autocast(enabled=(device != device("cpu"))):
+            extracted_feature_img = model.extract_features(
+                {"image": processed_pic, "text_input": ""}, mode="image"
+            )
+            extracted_feature_text = model.extract_features(
+                {"image": "", "text_input": processed_text}, mode="text"
+            )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.04566730558872223,
+        -0.042554520070552826,
+        -0.06970272958278656,
+        -0.009771779179573059,
+        0.01446065679192543,
+        0.10173682868480682,
+        0.007092420011758804,
+        -0.020045937970280647,
+        0.12923966348171234,
+        0.006452132016420364,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.1384519338607788,
+        -0.008663734421133995,
+        0.006240826100111008,
+        0.031466349959373474,
+        0.060625165700912476,
+        -0.03230545297265053,
+        0.01585903950035572,
+        -0.11856520175933838,
+        -0.05823372304439545,
+        0.036941494792699814,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_blip():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_blip(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.02480311505496502,
+        0.05037587881088257,
+        0.039517853409051895,
+        -0.06994109600782394,
+        -0.12886561453342438,
+        0.047039758414030075,
+        -0.11620642244815826,
+        -0.003398326924070716,
+        -0.07324369996786118,
+        0.06994668394327164,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.0118643119931221,
+        -0.01291718054562807,
+        -0.0009687161073088646,
+        0.01428765058517456,
+        -0.05591396614909172,
+        0.07386433333158493,
+        -0.11475936323404312,
+        0.01620068959891796,
+        0.0062415082938969135,
+        0.0034833091776818037,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_albef():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_albef(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features(
+            {"image": processed_pic, "text_input": ""}, mode="image"
+        )
+        extracted_feature_text = model.extract_features(
+            {"image": "", "text_input": processed_text}, mode="text"
+        )
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -1.0039474964141846,
+        -1.0039474964141846,
+        -0.8433647751808167,
+        -0.6097899675369263,
+        -0.5951915383338928,
+        -0.6243883967399597,
+        -0.6827820539474487,
+        -0.6097899675369263,
+        -0.7119789123535156,
+        -1.0623412132263184,
+    ]
+
+    assert (
+        processed_text
+        == "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
+    )
+
+    assert extracted_feature_img["image_embeds_proj"][0, 0, 10:20].tolist() == [
+        0.08971136063337326,
+        -0.10915573686361313,
+        -0.020636577159166336,
+        0.048121627420186996,
+        -0.05943416804075241,
+        -0.129856139421463,
+        -0.0034469354432076216,
+        0.017888527363538742,
+        -0.03284582123160362,
+        -0.1037328764796257,
+    ]
+
+    assert extracted_feature_text["text_embeds_proj"][0, 0, 10:20].tolist() == [
+        -0.06229640915989876,
+        0.11278597265481949,
+        0.06628583371639252,
+        0.1649140566587448,
+        0.068987175822258,
+        0.006291372701525688,
+        0.03244050219655037,
+        -0.049556829035282135,
+        0.050752390176057816,
+        -0.0421440489590168,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_base():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_base(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        0.15101124346256256,
+        -0.03759124130010605,
+        -0.40093156695365906,
+        -0.32228705286979675,
+        0.1576370894908905,
+        -0.23340347409248352,
+        -0.3892208933830261,
+        0.20170584321022034,
+        -0.030034437775611877,
+        0.19082790613174438,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        0.15391531586647034,
+        0.3078577518463135,
+        0.21737979352474213,
+        0.0775114893913269,
+        -0.3013279139995575,
+        0.2806251049041748,
+        -0.0407320111989975,
+        -0.02664487063884735,
+        -0.1858849972486496,
+        0.20347601175308228,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7703726291656494,
+        -0.7703726291656494,
+        -0.7703726291656494,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.3911527395248413,
+        -0.35456305742263794,
+        0.5724918842315674,
+        0.3184954524040222,
+        0.23444902896881104,
+        -0.14105141162872314,
+        0.26309096813201904,
+        -0.0559774711728096,
+        0.19491413235664368,
+        0.01419895887374878,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.07539052516222,
+        0.0939129889011383,
+        -0.2643853425979614,
+        -0.2459949105978012,
+        0.2387947291135788,
+        -0.5204038023948669,
+        -0.514020562171936,
+        -0.32557412981987,
+        0.18563221395015717,
+        -0.3183072805404663,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
+
+
+def test_load_feature_extractor_model_clip_vitl14_336():
+    my_dict = {}
+    multimodal_device = device("cuda" if cuda.is_available() else "cpu")
+    (
+        model,
+        vis_processor,
+        txt_processor,
+    ) = ms.MultimodalSearch.load_feature_extractor_model_clip_vitl14_336(
+        my_dict, multimodal_device
+    )
+    test_pic = Image.open(TEST_IMAGE_2).convert("RGB")
+    test_querry = (
+        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+    processed_pic = vis_processor["eval"](test_pic).unsqueeze(0).to(multimodal_device)
+    processed_text = txt_processor["eval"](test_querry)
+
+    with no_grad():
+        extracted_feature_img = model.extract_features({"image": processed_pic})
+        extracted_feature_text = model.extract_features({"text_input": processed_text})
+
+    assert processed_pic[0, 0, 0, 25:35].tolist() == [
+        -0.7995694875717163,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.7849710583686829,
+        -0.9163569211959839,
+        -1.149931788444519,
+        -1.0039474964141846,
+    ]
+
+    assert (
+        processed_text
+        == "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
+    )
+
+    assert extracted_feature_img[0, 10:20].tolist() == [
+        -0.15060146152973175,
+        -0.1998099535703659,
+        0.5503129363059998,
+        0.2589969336986542,
+        -0.0182882659137249,
+        -0.12753525376319885,
+        0.018985718488693237,
+        -0.17110440135002136,
+        0.02220013737678528,
+        0.01086437702178955,
+    ]
+
+    assert extracted_feature_text[0, 10:20].tolist() == [
+        -0.1172553077340126,
+        0.07105237245559692,
+        -0.283934086561203,
+        -0.24353823065757751,
+        0.22662702202796936,
+        -0.472959041595459,
+        -0.5191791653633118,
+        -0.29402273893356323,
+        0.22669515013694763,
+        -0.32044747471809387,
+    ]
+
+    del model, vis_processor, txt_processor
+    cuda.empty_cache()
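Two remarks on the added tests. In the BLIP-2 test, the autocast guard enabled=(device != device("cpu")) compares the imported device constructor itself against a torch.device instance, so it is always true; it presumably meant to check multimodal_device. And the assertions pin float slices with exact equality, which can be brittle across hardware and library versions; a tolerant alternative (a sketch, not what the commit does) is pytest.approx:

import pytest

# Sketch: tolerate tiny numeric drift instead of requiring exact equality.
reference = [0.04566730558872223, -0.042554520070552826]  # values pinned in the diff
computed = [0.045667301, -0.042554522]  # e.g. the same slice on different hardware

assert computed == pytest.approx(reference, rel=1e-4)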