Mirror of https://github.com/ssciwr/AMMICO.git, synced 2025-10-30 21:46:04 +02:00

Commit a5c43b6488
* deleted lavis from utils
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* fixed test_objects
* added 'not gcv' to CI
* fixed multimodal search and summary tests
* disable doc build on PR for now
* restrict ipywidgets version to avoid dummycomm error
* limit deepface version
* original repositories for retinaface lavis
* update gcv test results
* update display test outputs
* update test env
* run all tests
* wo xdist to avoid segfault
* remove widgets ref
* skip long-running tests
* skip long
* verbose codecov upload
* refactor summary test 2
* finish summary test refactor
* reduce memory overhead of SummaryDetector
* remove VQA models from self
* remove VQA models from self
* update notebook for changes
* update notebook for changes
* fixed multimodal search tests
* fixed tests in multimodal search after precommit
* run all tests
* update doc notebook for summary changes
* skip long-running multimodal
* exclude blip2 from testing

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Inga Ulusoy <inga.ulusoy@uni-heidelberg.de>
446 lines · 12 KiB · Python
import pytest
import math
from PIL import Image
import numpy
from torch import device, cuda
import misinformation.multimodal_search as ms


testdict = {
    "IMG_2746": {"filename": "./test/data/IMG_2746.png"},
    "IMG_2809": {"filename": "./test/data/IMG_2809.png"},
}

related_error = 1e-2
gpu_is_not_available = not cuda.is_available()


cuda.empty_cache()


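# Smoke test: read_img should load the test PNG; the RGB value of a single
# pixel is checked against a known reference.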
def test_read_img():
    my_dict = {}
    test_img = ms.MultimodalSearch.read_img(my_dict, testdict["IMG_2746"]["filename"])
    assert list(numpy.array(test_img)[257][34]) == [70, 66, 63]


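# Expected reference values for each model: two components of the preprocessed
# image tensor, the preprocessed query text, and two components of the extracted
# image and text features. Numeric comparisons below use math.isclose with
# rel_tol=related_error.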
pre_proc_pic_blip2_blip_albef = [
    -1.0039474964141846,
    -1.0039474964141846,
]
pre_proc_pic_clip_vitl14 = [
    -0.7995694875717163,
    -0.7849710583686829,
]

pre_proc_pic_clip_vitl14_336 = [
    -0.7995694875717163,
    -0.7849710583686829,
]

pre_proc_text_blip2_blip_albef = (
    "the bird sat on a tree located at the intersection of 23rd and 43rd streets"
)

pre_proc_text_clip_clip_vitl14_clip_vitl14_336 = (
    "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
)

pre_extracted_feature_img_blip2 = [
    0.04566730558872223,
    -0.042554520070552826,
]

pre_extracted_feature_img_blip = [
    -0.02480311505496502,
    0.05037587881088257,
]

pre_extracted_feature_img_albef = [
    0.08971136063337326,
    -0.10915573686361313,
]

pre_extracted_feature_img_clip = [
    0.01621132344007492,
    -0.004035486374050379,
]

pre_extracted_feature_img_parsing_clip = [
    0.01621132344007492,
    -0.004035486374050379,
]

pre_extracted_feature_img_clip_vitl14 = [
    -0.023943455889821053,
    -0.021703708916902542,
]

pre_extracted_feature_img_clip_vitl14_336 = [
    -0.009511193260550499,
    -0.012618942186236382,
]

pre_extracted_feature_text_blip2 = [
    -0.1384204626083374,
    -0.008662976324558258,
]

pre_extracted_feature_text_blip = [
    0.0118643119931221,
    -0.01291718054562807,
]

pre_extracted_feature_text_albef = [
    -0.06229640915989876,
    0.11278597265481949,
]

pre_extracted_feature_text_clip = [
    0.018169036135077477,
    0.03634127229452133,
]

pre_extracted_feature_text_clip_vitl14 = [
    -0.0055463071912527084,
    0.006908962037414312,
]

pre_extracted_feature_text_clip_vitl14_336 = [
    -0.008720514364540577,
    0.005284308455884457,
]

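# Expected similarity matrices and the corresponding orderings returned by
# multimodal_search for each model.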
simularity_blip2 = [
    [0.05826476216316223, -0.02717375010251999],
    [0.06297147274017334, 0.47339022159576416],
]

sorted_blip2 = [
    [1, 0],
    [1, 0],
]

simularity_blip = [
    [0.15640679001808167, 0.752173662185669],
    [0.17233705520629883, 0.8448910117149353],
]

sorted_blip = [
    [1, 0],
    [1, 0],
]

simularity_albef = [
    [0.12321824580430984, 0.35511350631713867],
    [0.10870333760976791, 0.5143978595733643],
]

sorted_albef = [
    [0, 1],
    [1, 0],
]

simularity_clip = [
    [0.23923014104366302, 0.5325412750244141],
    [0.2310466319322586, 0.5910375714302063],
]

sorted_clip = [
    [1, 0],
    [1, 0],
]

simularity_clip_vitl14 = [
    [0.1051270067691803, 0.5184808373451233],
    [0.1277746558189392, 0.6841973662376404],
]

sorted_clip_vitl14 = [
    [1, 0],
    [1, 0],
]

simularity_clip_vitl14_336 = [
    [0.09391091763973236, 0.49337542057037354],
    [0.13700757920742035, 0.7003108263015747],
]

sorted_clip_vitl14_336 = [
    [1, 0],
    [1, 0],
]

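# Reference ITM scores and Grad-CAM values for the BLIP models; these are not
# used by test_parsing_images below.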
dict_itm_scores_for_blib = {
    "blip_base": [
        0.07107225805521011,
        0.004100032616406679,
    ],
    "blip_large": [
        0.07890705019235611,
        0.00271016638725996,
    ],
    "blip2_coco": [
        0.0833505243062973,
        0.004216152708977461,
    ],
}

dict_image_gradcam_with_itm_for_blip = {
    "blip_base": [123.36285799741745, 132.31662154197693, 53.38280035299249],
    "blip_large": [119.99512910842896, 128.7044593691826, 55.552959859540515],
}


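# test_parsing_images is marked "long" so it can be deselected in CI and is
# parametrized over (device, model) combinations. CUDA variants are skipped
# when no GPU is available, and the blip2 cases are commented out (blip2 is
# excluded from testing, see the commit message above).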
@pytest.mark.long
@pytest.mark.parametrize(
    (
        "pre_multimodal_device",
        "pre_model",
        "pre_proc_pic",
        "pre_proc_text",
        "pre_extracted_feature_img",
        "pre_extracted_feature_text",
        "pre_simularity",
        "pre_sorted",
    ),
    [
        # (
        #     device("cpu"),
        #     "blip2",
        #     pre_proc_pic_blip2_blip_albef,
        #     pre_proc_text_blip2_blip_albef,
        #     pre_extracted_feature_img_blip2,
        #     pre_extracted_feature_text_blip2,
        #     simularity_blip2,
        #     sorted_blip2,
        # ),
        # pytest.param(
        #     device("cuda"),
        #     "blip2",
        #     pre_proc_pic_blip2_blip_albef,
        #     pre_proc_text_blip2_blip_albef,
        #     pre_extracted_feature_img_blip2,
        #     pre_extracted_feature_text_blip2,
        #     simularity_blip2,
        #     sorted_blip2,
        #     marks=pytest.mark.skipif(
        #         gpu_is_not_available, reason="gpu_is_not_availible"
        #     ),
        # ),
        (
            device("cpu"),
            "blip",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_blip,
            pre_extracted_feature_text_blip,
            simularity_blip,
            sorted_blip,
        ),
        pytest.param(
            device("cuda"),
            "blip",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_blip,
            pre_extracted_feature_text_blip,
            simularity_blip,
            sorted_blip,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu_is_not_availible"
            ),
        ),
        (
            device("cpu"),
            "albef",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_albef,
            pre_extracted_feature_text_albef,
            simularity_albef,
            sorted_albef,
        ),
        pytest.param(
            device("cuda"),
            "albef",
            pre_proc_pic_blip2_blip_albef,
            pre_proc_text_blip2_blip_albef,
            pre_extracted_feature_img_albef,
            pre_extracted_feature_text_albef,
            simularity_albef,
            sorted_albef,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu_is_not_availible"
            ),
        ),
        (
            device("cpu"),
            "clip_base",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip,
            pre_extracted_feature_text_clip,
            simularity_clip,
            sorted_clip,
        ),
        pytest.param(
            device("cuda"),
            "clip_base",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip,
            pre_extracted_feature_text_clip,
            simularity_clip,
            sorted_clip,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu_is_not_availible"
            ),
        ),
        (
            device("cpu"),
            "clip_vitl14",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14,
            pre_extracted_feature_text_clip_vitl14,
            simularity_clip_vitl14,
            sorted_clip_vitl14,
        ),
        pytest.param(
            device("cuda"),
            "clip_vitl14",
            pre_proc_pic_clip_vitl14,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14,
            pre_extracted_feature_text_clip_vitl14,
            simularity_clip_vitl14,
            sorted_clip_vitl14,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu_is_not_availible"
            ),
        ),
        (
            device("cpu"),
            "clip_vitl14_336",
            pre_proc_pic_clip_vitl14_336,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14_336,
            pre_extracted_feature_text_clip_vitl14_336,
            simularity_clip_vitl14_336,
            sorted_clip_vitl14_336,
        ),
        pytest.param(
            device("cuda"),
            "clip_vitl14_336",
            pre_proc_pic_clip_vitl14_336,
            pre_proc_text_clip_clip_vitl14_clip_vitl14_336,
            pre_extracted_feature_img_clip_vitl14_336,
            pre_extracted_feature_text_clip_vitl14_336,
            simularity_clip_vitl14_336,
            sorted_clip_vitl14_336,
            marks=pytest.mark.skipif(
                gpu_is_not_available, reason="gpu_is_not_availible"
            ),
        ),
    ],
)
def test_parsing_images(
    pre_multimodal_device,
    pre_model,
    pre_proc_pic,
    pre_proc_text,
    pre_extracted_feature_img,
    pre_extracted_feature_text,
    pre_simularity,
    pre_sorted,
):
    ms.MultimodalSearch.multimodal_device = pre_multimodal_device
    (
        model,
        vis_processor,
        txt_processor,
        image_keys,
        _,
        features_image_stacked,
    ) = ms.MultimodalSearch.parsing_images(testdict, pre_model)

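    # Two components of the stacked image features should match the reference
    # values for this model within the relative tolerance.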
    for i, num in zip(range(10), features_image_stacked[0, 10:12].tolist()):
        assert (
            math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error)
            is True
        )

    test_pic = Image.open(testdict["IMG_2746"]["filename"]).convert("RGB")
    test_querry = (
        "The bird sat on a tree located at the intersection of 23rd and 43rd streets."
    )
    processed_pic = (
        vis_processor["eval"](test_pic).unsqueeze(0).to(pre_multimodal_device)
    )
    processed_text = txt_processor["eval"](test_querry)

    for i, num in zip(range(10), processed_pic[0, 0, 0, 25:27].tolist()):
        assert math.isclose(num, pre_proc_pic[i], rel_tol=related_error) is True

    assert processed_text == pre_proc_text

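    # Process one text query and one image query; the resulting feature matrix
    # should contain the text features in row 0 and the image features in row 1.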
    search_query = [
        {"text_input": test_querry},
        {"image": testdict["IMG_2746"]["filename"]},
    ]
    multi_features_stacked = ms.MultimodalSearch.querys_processing(
        testdict, search_query, model, txt_processor, vis_processor, pre_model
    )

    for i, num in zip(range(10), multi_features_stacked[0, 10:12].tolist()):
        assert (
            math.isclose(num, pre_extracted_feature_text[i], rel_tol=related_error)
            is True
        )

    for i, num in zip(range(10), multi_features_stacked[1, 10:12].tolist()):
        assert (
            math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error)
            is True
        )

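    # Run the actual multimodal search with a second set of queries and compare
    # the similarity scores (with a wider tolerance) and the resulting ranking
    # against the per-model reference values.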
    search_query2 = [
        {"text_input": "A bus"},
        {"image": "../misinformation/test/data/IMG_3758.png"},
    ]

    similarity, sorted_list = ms.MultimodalSearch.multimodal_search(
        testdict,
        model,
        vis_processor,
        txt_processor,
        pre_model,
        image_keys,
        features_image_stacked,
        search_query2,
    )

    for i, num in zip(range(len(pre_simularity)), similarity.tolist()):
        for j, num2 in zip(range(len(num)), num):
            assert (
                math.isclose(num2, pre_simularity[i][j], rel_tol=100 * related_error)
                is True
            )

    for i, num in zip(range(len(pre_sorted)), sorted_list):
        for j, num2 in zip(range(2), num):
            assert num2 == pre_sorted[i][j]

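    # Explicitly release the model and intermediate tensors and clear the CUDA
    # cache to keep memory usage down across the parametrized runs.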
    del (
        model,
        vis_processor,
        txt_processor,
        similarity,
        features_image_stacked,
        processed_pic,
        multi_features_stacked,
    )
    cuda.empty_cache()