diff --git a/misinformation/multimodal_search.py b/misinformation/multimodal_search.py index de0df14..4de65e7 100644 --- a/misinformation/multimodal_search.py +++ b/misinformation/multimodal_search.py @@ -12,6 +12,8 @@ from IPython.display import display from lavis.models import load_model_and_preprocess, load_model, BlipBase from lavis.processors import load_processor +# from memory_profiler import profile + class MultimodalSearch(AnalysisMethod): def __init__(self, subdict: dict) -> None: @@ -339,13 +341,6 @@ class MultimodalSearch(AnalysisMethod): return text_query_index - def itm_images_processing(self, image_paths, vis_processor): - raw_images = [MultimodalSearch.read_img(self, path) for path in image_paths] - images = [vis_processor(r_img) for r_img in raw_images] - images_tensors = torch.stack(images).to(MultimodalSearch.multimodal_device) - - return raw_images, images_tensors - def get_pathes_from_query(self, query): paths = [] image_names = [] @@ -483,6 +478,7 @@ class MultimodalSearch(AnalysisMethod): vis_processor = load_processor("blip_image_eval").build(image_size=384) return itm_model, vis_processor + # @profile def image_text_match_reordering( self, search_query, @@ -602,6 +598,20 @@ class MultimodalSearch(AnalysisMethod): image_gradcam_with_itm[ list(search_query[index_text_query].values())[0] ] = localimage_gradcam_with_itm + del ( + itm_model, + vis_processor_itm, + text_processor, + raw_images, + images, + tokenizer, + queries_batch, + queries_tok_batch, + itm_score, + ) + if need_grad_cam: + del itm_output, gradcam, norm_img, grad_cam, avg_gradcam + torch.cuda.empty_cache() return itm_scores2, image_gradcam_with_itm def show_results(self, query, itm=False, image_gradcam_with_itm=False): diff --git a/misinformation/test/test_multimodal_search.py b/misinformation/test/test_multimodal_search.py index 83aa48f..b52e122 100644 --- a/misinformation/test/test_multimodal_search.py +++ b/misinformation/test/test_multimodal_search.py @@ -6,20 +6,8 @@ from torch import device, cuda import misinformation.multimodal_search as ms testdict = { - "d755771b-225e-432f-802e-fb8dc850fff7": { - "filename": "./test/data/d755771b-225e-432f-802e-fb8dc850fff7.png" - }, "IMG_2746": {"filename": "./test/data/IMG_2746.png"}, - "IMG_2750": {"filename": "./test/data/IMG_2750.png"}, - "IMG_2805": {"filename": "./test/data/IMG_2805.png"}, - "IMG_2806": {"filename": "./test/data/IMG_2806.png"}, - "IMG_2807": {"filename": "./test/data/IMG_2807.png"}, - "IMG_2808": {"filename": "./test/data/IMG_2808.png"}, "IMG_2809": {"filename": "./test/data/IMG_2809.png"}, - "IMG_3755": {"filename": "./test/data/IMG_3755.jpg"}, - "IMG_3756": {"filename": "./test/data/IMG_3756.jpg"}, - "IMG_3757": {"filename": "./test/data/IMG_3757.jpg"}, - "pic1": {"filename": "./test/data/pic1.png"}, } related_error = 1e-2 @@ -38,39 +26,15 @@ def test_read_img(): pre_proc_pic_blip2_blip_albef = [ -1.0039474964141846, -1.0039474964141846, - -0.8433647751808167, - -0.6097899675369263, - -0.5951915383338928, - -0.6243883967399597, - -0.6827820539474487, - -0.6097899675369263, - -0.7119789123535156, - -1.0623412132263184, ] pre_proc_pic_clip_vitl14 = [ -0.7995694875717163, -0.7849710583686829, - -0.7849710583686829, - -0.7703726291656494, - -0.7703726291656494, - -0.7849710583686829, - -0.7849710583686829, - -0.7703726291656494, - -0.7703726291656494, - -0.7703726291656494, ] pre_proc_pic_clip_vitl14_336 = [ -0.7995694875717163, -0.7849710583686829, - -0.7849710583686829, - -0.7849710583686829, - -0.7849710583686829, - -0.7849710583686829, - -0.7849710583686829, - -0.9163569211959839, - -1.149931788444519, - -1.0039474964141846, ] pre_proc_text_blip2_blip_albef = ( @@ -84,340 +48,146 @@ pre_proc_text_clip_clip_vitl14_clip_vitl14_336 = ( pre_extracted_feature_img_blip2 = [ 0.04566730558872223, -0.042554520070552826, - -0.06970272958278656, - -0.009771779179573059, - 0.01446065679192543, - 0.10173682868480682, - 0.007092420011758804, - -0.020045937970280647, - 0.12923966348171234, - 0.006452132016420364, ] pre_extracted_feature_img_blip = [ -0.02480311505496502, 0.05037587881088257, - 0.039517853409051895, - -0.06994109600782394, - -0.12886561453342438, - 0.047039758414030075, - -0.11620642244815826, - -0.003398326924070716, - -0.07324369996786118, - 0.06994668394327164, ] pre_extracted_feature_img_albef = [ 0.08971136063337326, -0.10915573686361313, - -0.020636577159166336, - 0.048121627420186996, - -0.05943416804075241, - -0.129856139421463, - -0.0034469354432076216, - 0.017888527363538742, - -0.03284582123160362, - -0.1037328764796257, ] pre_extracted_feature_img_clip = [ 0.01621132344007492, -0.004035486374050379, - -0.04304071143269539, - -0.03459808602929115, - 0.016922621056437492, - -0.025056276470422745, - -0.04178355261683464, - 0.02165347896516323, - -0.003224249929189682, - 0.020485712215304375, ] pre_extracted_feature_img_parsing_clip = [ 0.01621132344007492, -0.004035486374050379, - -0.04304071143269539, - -0.03459808602929115, - 0.016922621056437492, - -0.025056276470422745, - -0.04178355261683464, - 0.02165347896516323, - -0.003224249929189682, - 0.020485712215304375, ] pre_extracted_feature_img_clip_vitl14 = [ -0.023943455889821053, -0.021703708916902542, - 0.035043686628341675, - 0.019495919346809387, - 0.014351222664117813, - -0.008634116500616074, - 0.01610446907579899, - -0.003426523646339774, - 0.011931191198527813, - 0.0008691544644534588, ] pre_extracted_feature_img_clip_vitl14_336 = [ -0.009511193260550499, -0.012618942186236382, - 0.034754861146211624, - 0.016356879845261574, - -0.0011549904011189938, - -0.008054453879594803, - 0.0011990377679467201, - -0.010806051082909107, - 0.00140204350464046, - 0.0006861367146484554, ] pre_extracted_feature_text_blip2 = [ -0.1384204626083374, -0.008662976324558258, - 0.006269007455557585, - 0.03151319921016693, - 0.060558050870895386, - -0.03230040520429611, - 0.015861615538597107, - -0.11856459826231003, - -0.058296192437410355, - 0.03699290752410889, ] pre_extracted_feature_text_blip = [ 0.0118643119931221, -0.01291718054562807, - -0.0009687161073088646, - 0.01428765058517456, - -0.05591396614909172, - 0.07386433333158493, - -0.11475936323404312, - 0.01620068959891796, - 0.0062415082938969135, - 0.0034833091776818037, ] pre_extracted_feature_text_albef = [ -0.06229640915989876, 0.11278597265481949, - 0.06628583371639252, - 0.1649140566587448, - 0.068987175822258, - 0.006291372701525688, - 0.03244050219655037, - -0.049556829035282135, - 0.050752390176057816, - -0.0421440489590168, ] pre_extracted_feature_text_clip = [ 0.018169036135077477, 0.03634127229452133, - 0.025660742074251175, - 0.009149895049631596, - -0.035570453852415085, - 0.033126577734947205, - -0.004808237310498953, - -0.0031453112605959177, - -0.02194291725754738, - 0.024019461125135422, ] pre_extracted_feature_text_clip_vitl14 = [ -0.0055463071912527084, 0.006908962037414312, - -0.019450219348073006, - -0.018097277730703354, - 0.017567576840519905, - -0.03828490898013115, - -0.03781530633568764, - -0.023951737210154533, - 0.01365653332322836, - -0.02341713197529316, ] pre_extracted_feature_text_clip_vitl14_336 = [ -0.008720514364540577, 0.005284308455884457, - -0.021116750314831734, - -0.018112430348992348, - 0.01685470901429653, - -0.03517491742968559, - -0.038612402975559235, - -0.021867064759135246, - 0.01685977540910244, - -0.023832324892282486, ] simularity_blip2 = [ [0.05826476216316223, -0.02717375010251999], - [0.12869958579540253, 0.006344856694340706], - [0.11073512583971024, 0.12327021360397339], - [0.08743024617433548, 0.058944884687662125], - [0.04591086134314537, 0.4905201494693756], [0.06297147274017334, 0.47339022159576416], - [0.18486255407333374, 0.6350338459014893], - [0.015455856919288635, 0.018462061882019043], - [-0.008606988936662674, 0.00741103570908308], - [-0.0415784977376461, -0.1267213076353073], - [-0.025470387190580368, 0.1315656304359436], - [-0.05090826004743576, 0.059172093868255615], ] sorted_blip2 = [ - [6, 1, 2, 3, 5, 0, 4, 7, 8, 10, 9, 11], - [6, 4, 5, 10, 2, 11, 3, 7, 8, 1, 0, 9], + [1, 0], + [1, 0], ] simularity_blip = [ [0.15640679001808167, 0.752173662185669], - [0.15139800310134888, 0.7804810404777527], - [0.13010388612747192, 0.755257248878479], - [0.13746635615825653, 0.7618774175643921], - [0.1756758838891983, 0.8531903624534607], [0.17233705520629883, 0.8448910117149353], - [0.1970970332622528, 0.8916105628013611], - [0.11693969368934631, 0.5833531618118286], - [0.12386563420295715, 0.5981853604316711], - [0.08427951484918594, 0.4962371587753296], - [0.14193706214427948, 0.7613846659660339], - [0.12051936239004135, 0.6492202281951904], ] sorted_blip = [ - [6, 4, 5, 0, 1, 10, 3, 2, 8, 11, 7, 9], - [6, 4, 5, 1, 3, 10, 2, 0, 11, 8, 7, 9], + [1, 0], + [1, 0], ] simularity_albef = [ [0.12321824580430984, 0.35511350631713867], - [0.09512615948915482, 0.27168408036231995], - [0.09053325653076172, 0.20215675234794617], - [0.06335515528917313, 0.15055638551712036], - [0.09604836255311966, 0.4658776521682739], [0.10870333760976791, 0.5143978595733643], - [0.11748822033405304, 0.6542638540267944], - [0.05688793584704399, 0.22170542180538177], - [0.05597608536481857, 0.11963296681642532], - [0.059643782675266266, 0.14969395101070404], - [0.06690303236246109, 0.3149859607219696], - [0.07909377664327621, 0.11911341547966003], ] sorted_albef = [ - [0, 6, 5, 4, 1, 2, 11, 10, 3, 9, 7, 8], - [6, 5, 4, 0, 10, 1, 7, 2, 3, 9, 8, 11], + [0, 1], + [1, 0], ] simularity_clip = [ [0.23923014104366302, 0.5325412750244141], - [0.20101115107536316, 0.5112978219985962], - [0.17522737383842468, 0.49811851978302], - [0.20062290132045746, 0.5415266156196594], - [0.22865726053714752, 0.5762109756469727], [0.2310466319322586, 0.5910375714302063], - [0.2644523084163666, 0.7851459383964539], - [0.21474510431289673, 0.4135811924934387], - [0.16407863795757294, 0.1474374681711197], - [0.19819433987140656, 0.26493316888809204], - [0.19545596837997437, 0.5007457137107849], - [0.1647854745388031, 0.45705708861351013], ] sorted_clip = [ - [6, 0, 5, 4, 7, 1, 3, 9, 10, 2, 11, 8], - [6, 5, 4, 3, 0, 1, 10, 2, 11, 7, 9, 8], + [1, 0], + [1, 0], ] simularity_clip_vitl14 = [ [0.1051270067691803, 0.5184808373451233], - [0.09705893695354462, 0.49574509263038635], - [0.11964304000139236, 0.5424358248710632], - [0.13881900906562805, 0.5909714698791504], - [0.12728188931941986, 0.6758255362510681], [0.1277746558189392, 0.6841973662376404], - [0.18026694655418396, 0.803142786026001], - [0.13977059721946716, 0.45957139134407043], - [0.11180847883224487, 0.24822194874286652], - [0.12296056002378464, 0.35143694281578064], - [0.11596094071865082, 0.5704031586647034], - [0.10174489766359329, 0.44422751665115356], ] sorted_clip_vitl14 = [ - [6, 7, 3, 5, 4, 9, 2, 10, 8, 0, 11, 1], - [6, 5, 4, 3, 10, 2, 0, 1, 7, 11, 9, 8], + [1, 0], + [1, 0], ] simularity_clip_vitl14_336 = [ [0.09391091763973236, 0.49337542057037354], - [0.11103834211826324, 0.4881117343902588], - [0.12891019880771637, 0.5501476526260376], - [0.13288410007953644, 0.5498673915863037], - [0.12357455492019653, 0.6749162077903748], [0.13700757920742035, 0.7003108263015747], - [0.1788637489080429, 0.7713702321052551], - [0.13260436058044434, 0.4300197660923004], - [0.11666625738143921, 0.2334875613451004], - [0.1316065937280655, 0.3291645646095276], - [0.12374477833509445, 0.5632147192955017], - [0.10333051532506943, 0.43023794889450073], ] sorted_clip_vitl14_336 = [ - [6, 5, 3, 7, 9, 2, 10, 4, 8, 1, 11, 0], - [6, 5, 4, 10, 2, 3, 0, 1, 11, 7, 9, 8], + [1, 0], + [1, 0], ] dict_itm_scores_for_blib = { "blip_base": [ 0.07107225805521011, - 0.02078203856945038, - 0.02083236537873745, - 0.0042252070270478725, - 0.0002070252230623737, 0.004100032616406679, - 0.0009893750539049506, - 0.00015318625082727522, - 1.9936736862291582e-05, - 4.0084025386022404e-05, - 0.0006117739249020815, - 4.1486648115096614e-05, ], "blip_large": [ 0.07890705019235611, - 0.04954551160335541, - 0.05564938113093376, - 0.002710158471018076, - 0.0026644798927009106, - 0.01277624536305666, - 0.003585426602512598, - 0.0019450040999799967, - 0.0036240608897060156, - 0.0013280785642564297, - 0.015366943553090096, - 0.0030039174016565084, + 0.00271016638725996, ], "blip2_coco": [ 0.0833505243062973, - 0.046232130378484726, - 0.04996354877948761, - 0.004187352955341339, - 2.5233526685042307e-05, - 0.002679687924683094, - 2.4826533262967132e-05, - 5.1878203521482646e-05, - 1.3434584616334178e-05, - 9.76747560343938e-06, - 7.34204331820365e-06, - 1.1423194337112363e-05, + 0.004216152708977461, ], } dict_image_gradcam_with_itm_for_blip = { - "blip_base": [125.12124404, 132.07243145, 65.43589668], - "blip_large": [118.75610679, 125.35366997, 69.63849807], + "blip_base": [123.36285799741745, 132.31662154197693, 53.38280035299249], + "blip_large": [119.99512910842896, 128.7044593691826, 55.552959859540515], } @@ -594,7 +364,7 @@ def test_parsing_images( features_image_stacked, ) = ms.MultimodalSearch.parsing_images(testdict, pre_model) - for i, num in zip(range(10), features_image_stacked[0, 10:20].tolist()): + for i, num in zip(range(10), features_image_stacked[0, 10:12].tolist()): assert ( math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error) is True @@ -609,7 +379,7 @@ def test_parsing_images( ) processed_text = txt_processor["eval"](test_querry) - for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()): + for i, num in zip(range(10), processed_pic[0, 0, 0, 25:27].tolist()): assert math.isclose(num, pre_proc_pic[i], rel_tol=related_error) is True assert processed_text == pre_proc_text @@ -622,13 +392,13 @@ def test_parsing_images( testdict, search_query, model, txt_processor, vis_processor, pre_model ) - for i, num in zip(range(10), multi_features_stacked[0, 10:20].tolist()): + for i, num in zip(range(10), multi_features_stacked[0, 10:12].tolist()): assert ( math.isclose(num, pre_extracted_feature_text[i], rel_tol=related_error) is True ) - for i, num in zip(range(10), multi_features_stacked[1, 10:20].tolist()): + for i, num in zip(range(10), multi_features_stacked[1, 10:12].tolist()): assert ( math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error) is True @@ -650,39 +420,132 @@ def test_parsing_images( search_query2, ) - for i, num in zip(range(12), similarity.tolist()): + for i, num in zip(range(len(pre_simularity)), similarity.tolist()): for j, num2 in zip(range(len(num)), num): assert ( math.isclose(num2, pre_simularity[i][j], rel_tol=100 * related_error) is True ) - for i, num in zip(range(2), sorted_list): + for i, num in zip(range(len(pre_sorted)), sorted_list): for j, num2 in zip(range(2), num): assert num2 == pre_sorted[i][j] - del model, vis_processor, txt_processor + del ( + model, + vis_processor, + txt_processor, + similarity, + features_image_stacked, + processed_pic, + multi_features_stacked, + ) cuda.empty_cache() - if pre_model == "blip": - for itm_model in ["blip_base", "blip_large", "blip2_coco"]: - ( - itm_scores, - image_gradcam_with_itm, - ) = ms.MultimodalSearch.image_text_match_reordering( - testdict, - search_query2, - itm_model, - image_keys, - sorted_list, - batch_size=1, - need_grad_cam=False, + +def test_itm(): + test_my_dict = { + "IMG_2746": { + "filename": "../misinformation/test/data/IMG_2746.png", + "rank A bus": 1, + "A bus": 0.15640679001808167, + "rank ../misinformation/test/data/IMG_3758.png": 1, + "../misinformation/test/data/IMG_3758.png": 0.7533495426177979, + }, + "IMG_2809": { + "filename": "../misinformation/test/data/IMG_2809.png", + "rank A bus": 0, + "A bus": 0.1970970332622528, + "rank ../misinformation/test/data/IMG_3758.png": 0, + "../misinformation/test/data/IMG_3758.png": 0.8907483816146851, + }, + } + search_query3 = [ + {"text_input": "A bus"}, + {"image": "../misinformation/test/data/IMG_3758.png"}, + ] + image_keys = ["IMG_2746", "IMG_2809"] + sorted_list = [[1, 0], [1, 0]] + for itm_model in ["blip_base", "blip_large"]: + ( + itm_scores, + image_gradcam_with_itm, + ) = ms.MultimodalSearch.image_text_match_reordering( + test_my_dict, + search_query3, + itm_model, + image_keys, + sorted_list, + batch_size=1, + need_grad_cam=True, + ) + for i, itm in zip( + range(len(dict_itm_scores_for_blib[itm_model])), + dict_itm_scores_for_blib[itm_model], + ): + assert ( + math.isclose(itm_scores[0].tolist()[i], itm, rel_tol=10 * related_error) + is True ) - for i, itm in zip( - range(len(dict_itm_scores_for_blib[itm_model])), - dict_itm_scores_for_blib[itm_model], - ): - assert ( - math.isclose(itm_scores[0].tolist()[i], itm, rel_tol=related_error) - is True + for i, grad_cam in zip( + range(len(dict_image_gradcam_with_itm_for_blip[itm_model])), + dict_image_gradcam_with_itm_for_blip[itm_model], + ): + assert ( + math.isclose( + image_gradcam_with_itm["A bus"]["IMG_2809"][0][0].tolist()[i], + grad_cam, + rel_tol=10 * related_error, ) + is True + ) + del itm_scores, image_gradcam_with_itm + cuda.empty_cache() + + +def test_itm_blip2_coco(): + test_my_dict = { + "IMG_2746": { + "filename": "../misinformation/test/data/IMG_2746.png", + "rank A bus": 1, + "A bus": 0.15640679001808167, + "rank ../misinformation/test/data/IMG_3758.png": 1, + "../misinformation/test/data/IMG_3758.png": 0.7533495426177979, + }, + "IMG_2809": { + "filename": "../misinformation/test/data/IMG_2809.png", + "rank A bus": 0, + "A bus": 0.1970970332622528, + "rank ../misinformation/test/data/IMG_3758.png": 0, + "../misinformation/test/data/IMG_3758.png": 0.8907483816146851, + }, + } + search_query3 = [ + {"text_input": "A bus"}, + {"image": "../misinformation/test/data/IMG_3758.png"}, + ] + image_keys = ["IMG_2746", "IMG_2809"] + sorted_list = [[1, 0], [1, 0]] + + ( + itm_scores, + image_gradcam_with_itm, + ) = ms.MultimodalSearch.image_text_match_reordering( + test_my_dict, + search_query3, + "blip2_coco", + image_keys, + sorted_list, + batch_size=1, + need_grad_cam=False, + ) + for i, itm in zip( + range(len(dict_itm_scores_for_blib["blip2_coco"])), + dict_itm_scores_for_blib["blip2_coco"], + ): + assert ( + math.isclose(itm_scores[0].tolist()[i], itm, rel_tol=10 * related_error) + is True + ) + del itm_scores, image_gradcam_with_itm + cuda.empty_cache()