{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "22df2297-0629-45aa-b88c-6c61f1544db6",
"metadata": {},
"source": [
"# Image Multimodal Search"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9eeeb302-296e-48dc-86c7-254aa02f2b3a",
"metadata": {},
"source": [
"This notebooks shows how to carry out an image multimodal search with the [LAVIS](https://github.com/salesforce/LAVIS) library. \n",
|
|
"\n",
|
|
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
|
|
"\n",
|
|
"After that, we can import `ammico` and read in the files given a folder path."
|
|
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0b0a6bdf",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:16.066219Z",
"iopub.status.busy": "2023-08-16T08:16:16.065692Z",
"iopub.status.idle": "2023-08-16T08:16:16.076278Z",
"shell.execute_reply": "2023-08-16T08:16:16.075551Z"
}
},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
|
|
" # update python version\n",
|
|
" # install setuptools\n",
|
|
" # %pip install setuptools==61 -qqq\n",
|
|
" # install ammico\n",
|
|
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
|
|
" # mount google drive for data and API key\n",
|
|
" from google.colab import drive\n",
|
|
"\n",
|
|
" drive.mount(\"/content/drive\")"
|
|
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f10ad6c9-b1a0-4043-8c5d-ed660d77be37",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:16.080294Z",
"iopub.status.busy": "2023-08-16T08:16:16.079785Z",
"iopub.status.idle": "2023-08-16T08:16:34.303721Z",
"shell.execute_reply": "2023-08-16T08:16:34.302694Z"
},
"tags": []
},
"outputs": [],
"source": [
"import ammico.utils as mutils\n",
"import ammico.multimodal_search as ms"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8d3fe589-ff3c-4575-b8f5-650db85596bc",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.308982Z",
"iopub.status.busy": "2023-08-16T08:16:34.308130Z",
"iopub.status.idle": "2023-08-16T08:16:34.313865Z",
"shell.execute_reply": "2023-08-16T08:16:34.313086Z"
},
"tags": []
},
"outputs": [],
"source": [
"images = mutils.find_files(\n",
" path=\"data/\",\n",
|
|
" limit=10,\n",
|
|
")"
|
|
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a08bd3a9-e954-4a0e-ad64-6817abd3a25a",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.317413Z",
"iopub.status.busy": "2023-08-16T08:16:34.316889Z",
"iopub.status.idle": "2023-08-16T08:16:34.324585Z",
"shell.execute_reply": "2023-08-16T08:16:34.323776Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['data/102730_eng.png', 'data/102141_2_eng.png', 'data/106349S_por.png']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"images"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "adf3db21-1f8b-4d44-bbef-ef0acf4623a0",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.329442Z",
"iopub.status.busy": "2023-08-16T08:16:34.328991Z",
"iopub.status.idle": "2023-08-16T08:16:34.332751Z",
"shell.execute_reply": "2023-08-16T08:16:34.331965Z"
},
"tags": []
},
"outputs": [],
"source": [
"mydict = mutils.initialize_dict(images)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4c091f95-07cf-42c3-82c8-5f3a3c5929f8",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.336236Z",
"iopub.status.busy": "2023-08-16T08:16:34.335970Z",
"iopub.status.idle": "2023-08-16T08:16:34.340398Z",
"shell.execute_reply": "2023-08-16T08:16:34.339643Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'102730_eng': {'filename': 'data/102730_eng.png'},\n",
" '102141_2_eng': {'filename': 'data/102141_2_eng.png'},\n",
" '106349S_por': {'filename': 'data/106349S_por.png'}}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mydict"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "987540a8-d800-4c70-a76b-7bfabaf123fa",
"metadata": {},
"source": [
"## Indexing and extracting features from images in selected folder"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "66d6ede4-00bc-4aeb-9a36-e52d7de33fe5",
"metadata": {},
"source": [
"First you need to select a model. You can choose one of the following models:\n",
"- [blip](https://github.com/salesforce/BLIP)\n",
"- [blip2](https://huggingface.co/docs/transformers/main/model_doc/blip-2)\n",
"- [albef](https://github.com/salesforce/ALBEF)\n",
"- [clip_base](https://github.com/openai/CLIP/blob/main/model-card.md)\n",
"- [clip_vitl14](https://github.com/mlfoundations/open_clip)\n",
"- [clip_vitl14_336](https://github.com/mlfoundations/open_clip)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7bbca1f0-d4b0-43cd-8e05-ee39d37c328e",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.344803Z",
"iopub.status.busy": "2023-08-16T08:16:34.344383Z",
"iopub.status.idle": "2023-08-16T08:16:34.348012Z",
"shell.execute_reply": "2023-08-16T08:16:34.347199Z"
},
"tags": []
},
"outputs": [],
"source": [
"model_type = \"blip\"\n",
"# model_type = \"blip2\"\n",
"# model_type = \"albef\"\n",
"# model_type = \"clip_base\"\n",
"# model_type = \"clip_vitl14\"\n",
"# model_type = \"clip_vitl14_336\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "357828c9",
"metadata": {},
"source": [
"To process the loaded images using the selected model, use the below code:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f6f2c9b1-4a91-47cb-86b5-2c9c67e4837b",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.351771Z",
"iopub.status.busy": "2023-08-16T08:16:34.351355Z",
"iopub.status.idle": "2023-08-16T08:16:34.354942Z",
"shell.execute_reply": "2023-08-16T08:16:34.354136Z"
}
},
"outputs": [],
"source": [
"my_obj = ms.MultimodalSearch(mydict)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "16603ded-078e-4362-847b-57ad76829327",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.358472Z",
"iopub.status.busy": "2023-08-16T08:16:34.358210Z",
"iopub.status.idle": "2023-08-16T08:16:34.363109Z",
"shell.execute_reply": "2023-08-16T08:16:34.362325Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'102730_eng': {'filename': 'data/102730_eng.png'},\n",
" '102141_2_eng': {'filename': 'data/102141_2_eng.png'},\n",
" '106349S_por': {'filename': 'data/106349S_por.png'}}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_obj.subdict"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ca095404-57d0-4f5d-aeb0-38c232252b17",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:16:34.367613Z",
"iopub.status.busy": "2023-08-16T08:16:34.367023Z",
"iopub.status.idle": "2023-08-16T08:17:20.367654Z",
"shell.execute_reply": "2023-08-16T08:17:20.365995Z"
},
"tags": []
},
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: 7f5b6643-6ee8-496d-b43e-b7fd632e1ef4)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: b5704c91-0fda-4cfe-a97e-684766d1c312)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: be664022-04b6-4d24-977e-58eb52652c76)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"'(ReadTimeoutError(\"HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: 950380a7-93f7-45c9-8510-0769a1e04299)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "OSError",
|
|
"evalue": "Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[10], line 8\u001b[0m\n\u001b[1;32m 1\u001b[0m (\n\u001b[1;32m 2\u001b[0m model,\n\u001b[1;32m 3\u001b[0m vis_processors,\n\u001b[1;32m 4\u001b[0m txt_processors,\n\u001b[1;32m 5\u001b[0m image_keys,\n\u001b[1;32m 6\u001b[0m image_names,\n\u001b[1;32m 7\u001b[0m features_image_stacked,\n\u001b[0;32m----> 8\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[43mmy_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparsing_images\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_to_save_tensors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
|
"File \u001b[0;32m~/work/AMMICO/AMMICO/ammico/multimodal_search.py:363\u001b[0m, in \u001b[0;36mMultimodalSearch.parsing_images\u001b[0;34m(self, model_type, path_to_save_tensors, path_to_load_tensors)\u001b[0m\n\u001b[1;32m 349\u001b[0m select_extract_image_features \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 350\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblip2\u001b[39m\u001b[38;5;124m\"\u001b[39m: MultimodalSearch\u001b[38;5;241m.\u001b[39mextract_image_features_blip2,\n\u001b[1;32m 351\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mblip\u001b[39m\u001b[38;5;124m\"\u001b[39m: MultimodalSearch\u001b[38;5;241m.\u001b[39mextract_image_features_basic,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclip_vitl14_336\u001b[39m\u001b[38;5;124m\"\u001b[39m: MultimodalSearch\u001b[38;5;241m.\u001b[39mextract_image_features_clip,\n\u001b[1;32m 356\u001b[0m }\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_type \u001b[38;5;129;01min\u001b[39;00m select_model\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 359\u001b[0m (\n\u001b[1;32m 360\u001b[0m model,\n\u001b[1;32m 361\u001b[0m vis_processors,\n\u001b[1;32m 362\u001b[0m txt_processors,\n\u001b[0;32m--> 363\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[43mselect_model\u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 364\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_type\u001b[49m\n\u001b[1;32m 365\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mMultimodalSearch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmultimodal_device\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 367\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mSyntaxError\u001b[39;00m(\n\u001b[1;32m 368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease, use one of the following models: blip2, blip, albef, clip_base, clip_vitl14, clip_vitl14_336\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 369\u001b[0m )\n",
|
|
"File \u001b[0;32m~/work/AMMICO/AMMICO/ammico/multimodal_search.py:55\u001b[0m, in \u001b[0;36mMultimodalSearch.load_feature_extractor_model_blip\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_feature_extractor_model_blip\u001b[39m(\u001b[38;5;28mself\u001b[39m, device: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 44\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;124;03m Load base blip_feature_extractor model and preprocessors for visual and text inputs from lavis.models.\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;124;03m txt_processors (dict): preprocessors for text inputs.\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m model, vis_processors, txt_processors \u001b[38;5;241m=\u001b[39m \u001b[43mload_model_and_preprocess\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mblip_feature_extractor\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbase\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 59\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model, vis_processors, txt_processors\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/lavis/models/__init__.py:195\u001b[0m, in \u001b[0;36mload_model_and_preprocess\u001b[0;34m(name, model_type, is_eval, device)\u001b[0m\n\u001b[1;32m 192\u001b[0m model_cls \u001b[38;5;241m=\u001b[39m registry\u001b[38;5;241m.\u001b[39mget_model_class(name)\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# load model\u001b[39;00m\n\u001b[0;32m--> 195\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_cls\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_eval:\n\u001b[1;32m 198\u001b[0m model\u001b[38;5;241m.\u001b[39meval()\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/lavis/models/base_model.py:70\u001b[0m, in \u001b[0;36mBaseModel.from_pretrained\u001b[0;34m(cls, model_type)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;124;03mBuild a pretrained model from default configuration file, specified by model_type.\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m - model (nn.Module): pretrained or finetuned model, depending on the configuration.\u001b[39;00m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 69\u001b[0m model_cfg \u001b[38;5;241m=\u001b[39m OmegaConf\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_config_path(model_type))\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m---> 70\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_config\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_cfg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/lavis/models/blip_models/blip_feature_extractor.py:198\u001b[0m, in \u001b[0;36mBlipFeatureExtractor.from_config\u001b[0;34m(cls, cfg)\u001b[0m\n\u001b[1;32m 195\u001b[0m embed_dim \u001b[38;5;241m=\u001b[39m cfg\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124membed_dim\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m256\u001b[39m)\n\u001b[1;32m 196\u001b[0m max_txt_len \u001b[38;5;241m=\u001b[39m cfg\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmax_txt_len\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m30\u001b[39m)\n\u001b[0;32m--> 198\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 199\u001b[0m \u001b[43m \u001b[49m\u001b[43mimage_encoder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mimage_encoder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 200\u001b[0m \u001b[43m \u001b[49m\u001b[43mtext_encoder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtext_encoder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 201\u001b[0m \u001b[43m \u001b[49m\u001b[43membed_dim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43membed_dim\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 202\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_txt_len\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_txt_len\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 205\u001b[0m \u001b[38;5;66;03m# load pre-trained weights\u001b[39;00m\n\u001b[1;32m 206\u001b[0m pretrain_path \u001b[38;5;241m=\u001b[39m cfg\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpretrained\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/lavis/models/blip_models/blip_feature_extractor.py:41\u001b[0m, in \u001b[0;36mBlipFeatureExtractor.__init__\u001b[0;34m(self, image_encoder, text_encoder, embed_dim, max_txt_len)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, image_encoder, text_encoder, embed_dim, max_txt_len\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m):\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m()\n\u001b[0;32m---> 41\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minit_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvisual_encoder \u001b[38;5;241m=\u001b[39m image_encoder\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext_encoder \u001b[38;5;241m=\u001b[39m text_encoder\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/lavis/models/blip_models/blip.py:22\u001b[0m, in \u001b[0;36mBlipBase.init_tokenizer\u001b[0;34m(cls)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minit_tokenizer\u001b[39m(\u001b[38;5;28mcls\u001b[39m):\n\u001b[0;32m---> 22\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mBertTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbert-base-uncased\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39madd_special_tokens({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbos_token\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[DEC]\u001b[39m\u001b[38;5;124m\"\u001b[39m})\n\u001b[1;32m 24\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39madd_special_tokens({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124madditional_special_tokens\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[ENC]\u001b[39m\u001b[38;5;124m\"\u001b[39m]})\n",
|
|
"File \u001b[0;32m/opt/hostedtoolcache/Python/3.9.17/x64/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1788\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1782\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 1783\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt load following files from cache: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00munresolved_files\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m and cannot check if these \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1784\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfiles are necessary for the tokenizer to operate.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1785\u001b[0m )\n\u001b[1;32m 1787\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mall\u001b[39m(full_file_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m full_file_name \u001b[38;5;129;01min\u001b[39;00m resolved_vocab_files\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[0;32m-> 1788\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 1789\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt load tokenizer for \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpretrained_model_name_or_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. If you were trying to load it from \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1790\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttps://huggingface.co/models\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, make sure you don\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt have a local directory with the same name. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1791\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOtherwise, make sure \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpretrained_model_name_or_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m is the correct path to a directory \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1792\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontaining all relevant files for a \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m tokenizer.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1793\u001b[0m )\n\u001b[1;32m 1795\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m file_id, file_path \u001b[38;5;129;01min\u001b[39;00m vocab_files\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 1796\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m resolved_vocab_files:\n",
|
|
"\u001b[0;31mOSError\u001b[0m: Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer."
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"(\n",
|
|
" model,\n",
|
|
" vis_processors,\n",
|
|
" txt_processors,\n",
|
|
" image_keys,\n",
|
|
" image_names,\n",
|
|
" features_image_stacked,\n",
|
|
") = my_obj.parsing_images(\n",
|
|
" model_type, \n",
|
|
" path_to_save_tensors=\"data/\",\n",
|
|
" )"
|
|
]
|
|
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f236c3b1-c3a6-471a-9fc5-ef831b675286",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.372739Z",
"iopub.status.busy": "2023-08-16T08:17:20.371874Z",
"iopub.status.idle": "2023-08-16T08:17:20.412580Z",
"shell.execute_reply": "2023-08-16T08:17:20.411656Z"
}
},
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'features_image_stacked' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfeatures_image_stacked\u001b[49m\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'features_image_stacked' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"features_image_stacked"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"id": "9ff8a894-566b-4c4f-acca-21c50b5b1f52",
|
|
"metadata": {},
|
|
"source": [
|
|
"The images are then processed and stored in a numerical representation, a tensor. These tensors do not change for the same image and same model - so if you run this analysis once, and save the tensors giving a path with the keyword `path_to_save_tensors`, a file with filename `.<Number_of_images>_<model_name>_saved_features_image.pt` will be placed there.\n",
|
|
"\n",
|
|
"This will save you a lot of time if you want to analyse same images with the same model but different questions. To run using the saved tensors, execute the below code giving the path and name of the tensor file."
|
|
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "56c6d488-f093-4661-835a-5c73a329c874",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.416684Z",
"iopub.status.busy": "2023-08-16T08:17:20.416162Z",
"iopub.status.idle": "2023-08-16T08:17:20.419771Z",
"shell.execute_reply": "2023-08-16T08:17:20.418954Z"
},
"tags": []
},
"outputs": [],
"source": [
|
|
"# (\n",
|
|
"# model,\n",
|
|
"# vis_processors,\n",
|
|
"# txt_processors,\n",
|
|
"# image_keys,\n",
|
|
"# image_names,\n",
|
|
"# features_image_stacked,\n",
|
|
"# ) = my_obj.parsing_images(\n",
|
|
"# model_type,\n",
|
|
"# path_to_load_tensors=\"/content/drive/MyDrive/misinformation-data/5_clip_base_saved_features_image.pt\",\n",
|
|
"# )"
|
|
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "309923c1-d6f8-4424-8fca-bde5f3a98b38",
"metadata": {},
"source": [
"Here we already processed our image folder with 5 images and the `clip_base` model. So you need just to write the name `5_clip_base_saved_features_image.pt` of the saved file that consists of tensors of all images as keyword argument for `path_to_load_tensors`. "
|
|
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "162a52e8-6652-4897-b92e-645cab07aaef",
"metadata": {},
"source": [
"## Formulate your search queries\n",
"\n",
"Next, you need to form search queries. You can search either by image or by text. You can search for a single query, or you can search for several queries at once, the computational time should not be much different. The format of the queries is as follows:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c4196a52-d01e-42e4-8674-5712f7d6f792",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.423779Z",
"iopub.status.busy": "2023-08-16T08:17:20.423066Z",
"iopub.status.idle": "2023-08-16T08:17:20.427150Z",
"shell.execute_reply": "2023-08-16T08:17:20.426351Z"
},
"tags": []
},
"outputs": [],
"source": [
"search_query3 = [\n",
|
|
" {\"text_input\": \"politician press conference\"},\n",
|
|
" {\"text_input\": \"a world map\"},\n",
|
|
" {\"text_input\": \"a dog\"},\n",
|
|
"]"
|
|
]
|
|
},
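{
"attachments": {},
"cell_type": "markdown",
"id": "image-query-sketch-md",
"metadata": {},
"source": [
"The queries above are text queries. You can also search by image; the commented cell below is a minimal sketch of what such a query could look like, assuming a query image is passed as a file path under an `\"image\"` key (an assumption here, to be checked against the ammico documentation)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "image-query-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# minimal sketch of an image-based query; the \"image\" key is an\n",
"# assumption and should be checked against the ammico documentation\n",
"# search_query_image = [\n",
"#     {\"image\": \"data/102730_eng.png\"},\n",
"# ]"
]
},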
{
"attachments": {},
"cell_type": "markdown",
"id": "8bcf3127-3dfd-4ff4-b9e7-a043099b1418",
"metadata": {},
"source": [
"You can filter your results in 3 different ways:\n",
|
|
"- `filter_number_of_images` limits the number of images found. That is, if the parameter `filter_number_of_images = 10`, then the first 10 images that best match the query will be shown. The other images ranks will be set to `None` and the similarity value to `0`.\n",
|
|
"- `filter_val_limit` limits the output of images with a similarity value not bigger than `filter_val_limit`. That is, if the parameter `filter_val_limit = 0.2`, all images with similarity less than 0.2 will be discarded.\n",
|
|
"- `filter_rel_error` (percentage) limits the output of images with a similarity value not bigger than `100 * abs(current_simularity_value - best_simularity_value_in_current_search)/best_simularity_value_in_current_search < filter_rel_error`. That is, if we set filter_rel_error = 30, it means that if the top1 image have 0.5 similarity value, we discard all image with similarity less than 0.35."
|
|
]
},
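{
"attachments": {},
"cell_type": "markdown",
"id": "filter-sketch-md",
"metadata": {},
"source": [
"The following commented cell is a minimal sketch combining all three filter keyword arguments in one call; the filter values are illustrative, not recommendations, and the cell is commented out so that only the live call below it runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "filter-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# illustrative sketch: combine all three filter keyword arguments\n",
"# (the filter values are arbitrary examples)\n",
"# similarity, sorted_lists = my_obj.multimodal_search(\n",
"#     model,\n",
"#     vis_processors,\n",
"#     txt_processors,\n",
"#     model_type,\n",
"#     image_keys,\n",
"#     features_image_stacked,\n",
"#     search_query3,\n",
"#     filter_number_of_images=10,\n",
"#     filter_val_limit=0.2,\n",
"#     filter_rel_error=30,\n",
"# )"
]
},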
{
"cell_type": "code",
"execution_count": 14,
"id": "7f7dc52f-7ee9-4590-96b7-e0d9d3b82378",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.430404Z",
"iopub.status.busy": "2023-08-16T08:17:20.429983Z",
"iopub.status.idle": "2023-08-16T08:17:20.470136Z",
"shell.execute_reply": "2023-08-16T08:17:20.469401Z"
},
"tags": []
},
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'model' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m similarity, sorted_lists \u001b[38;5;241m=\u001b[39m my_obj\u001b[38;5;241m.\u001b[39mmultimodal_search(\n\u001b[0;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m,\n\u001b[1;32m 3\u001b[0m vis_processors,\n\u001b[1;32m 4\u001b[0m txt_processors,\n\u001b[1;32m 5\u001b[0m model_type,\n\u001b[1;32m 6\u001b[0m image_keys,\n\u001b[1;32m 7\u001b[0m features_image_stacked,\n\u001b[1;32m 8\u001b[0m search_query3,\n\u001b[1;32m 9\u001b[0m filter_number_of_images\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m,\n\u001b[1;32m 10\u001b[0m )\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"similarity, sorted_lists = my_obj.multimodal_search(\n",
|
|
" model,\n",
|
|
" vis_processors,\n",
|
|
" txt_processors,\n",
|
|
" model_type,\n",
|
|
" image_keys,\n",
|
|
" features_image_stacked,\n",
|
|
" search_query3,\n",
|
|
" filter_number_of_images=20,\n",
|
|
")"
|
|
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "65210ca2-b674-44bd-807a-4165e14bad74",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.474037Z",
"iopub.status.busy": "2023-08-16T08:17:20.473611Z",
"iopub.status.idle": "2023-08-16T08:17:20.514967Z",
"shell.execute_reply": "2023-08-16T08:17:20.514028Z"
}
},
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'similarity' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msimilarity\u001b[49m\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'similarity' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"similarity"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "557473df-e2b9-4ef0-9439-3daadf6741ac",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2023-08-16T08:17:20.518830Z",
|
|
"iopub.status.busy": "2023-08-16T08:17:20.518185Z",
|
|
"iopub.status.idle": "2023-08-16T08:17:20.557391Z",
|
|
"shell.execute_reply": "2023-08-16T08:17:20.556479Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'sorted_lists' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msorted_lists\u001b[49m\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'sorted_lists' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"sorted_lists"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "c93d7e88-594d-4095-b5f2-7bf01210dc61",
|
|
"metadata": {
|
|
"execution": {
|
|
"iopub.execute_input": "2023-08-16T08:17:20.561114Z",
|
|
"iopub.status.busy": "2023-08-16T08:17:20.560581Z",
|
|
"iopub.status.idle": "2023-08-16T08:17:20.565910Z",
|
|
"shell.execute_reply": "2023-08-16T08:17:20.565096Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'102730_eng': {'filename': 'data/102730_eng.png'},\n",
|
|
" '102141_2_eng': {'filename': 'data/102141_2_eng.png'},\n",
|
|
" '106349S_por': {'filename': 'data/106349S_por.png'}}"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"mydict"
|
|
]
|
|
},
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"id": "e1cf7e46-0c2c-4fb2-b89a-ef585ccb9339",
|
|
"metadata": {},
|
|
"source": [
|
|
"After launching `multimodal_search` function, the results of each query will be added to the source dictionary. "
|
|
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9ad74b21-6187-4a58-9ed8-fd3e80f5a4ed",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.570455Z",
"iopub.status.busy": "2023-08-16T08:17:20.569839Z",
"iopub.status.idle": "2023-08-16T08:17:20.575115Z",
"shell.execute_reply": "2023-08-16T08:17:20.574320Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'filename': 'data/106349S_por.png'}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mydict[\"106349S_por\"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cd3ee120-8561-482b-a76a-e8f996783325",
"metadata": {},
"source": [
"A special function was written to present the search results conveniently. "
|
|
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4324e4fd-e9aa-4933-bb12-074d54e0c510",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.579431Z",
"iopub.status.busy": "2023-08-16T08:17:20.578988Z",
"iopub.status.idle": "2023-08-16T08:17:20.687595Z",
"shell.execute_reply": "2023-08-16T08:17:20.686391Z"
},
"tags": []
},
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'Your search query: politician press conference'"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'--------------------------------------------------'"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'Results:'"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"ename": "KeyError",
|
|
"evalue": "'politician press conference'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[19], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmy_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshow_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43msearch_query3\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n",
|
|
"File \u001b[0;32m~/work/AMMICO/AMMICO/ammico/multimodal_search.py:970\u001b[0m, in \u001b[0;36mMultimodalSearch.show_results\u001b[0;34m(self, query, itm, image_gradcam_with_itm)\u001b[0m\n\u001b[1;32m 967\u001b[0m current_querry_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(query\u001b[38;5;241m.\u001b[39mvalues())[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 968\u001b[0m current_querry_rank \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrank \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(query\u001b[38;5;241m.\u001b[39mvalues())[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 970\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43msorted\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 971\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msubdict\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcurrent_querry_val\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\n\u001b[1;32m 972\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 973\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s[\u001b[38;5;241m1\u001b[39m][current_querry_rank] \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 974\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
|
|
"File \u001b[0;32m~/work/AMMICO/AMMICO/ammico/multimodal_search.py:971\u001b[0m, in \u001b[0;36mMultimodalSearch.show_results.<locals>.<lambda>\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 967\u001b[0m current_querry_val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(query\u001b[38;5;241m.\u001b[39mvalues())[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 968\u001b[0m current_querry_rank \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrank \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(query\u001b[38;5;241m.\u001b[39mvalues())[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 970\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(\n\u001b[0;32m--> 971\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msubdict\u001b[38;5;241m.\u001b[39mitems(), key\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mlambda\u001b[39;00m t: \u001b[43mt\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcurrent_querry_val\u001b[49m\u001b[43m]\u001b[49m, reverse\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 972\u001b[0m ):\n\u001b[1;32m 973\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s[\u001b[38;5;241m1\u001b[39m][current_querry_rank] \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 974\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
|
|
"\u001b[0;31mKeyError\u001b[0m: 'politician press conference'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"my_obj.show_results(\n",
|
|
" search_query3[0],\n",
|
|
")"
|
|
]
|
|
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0b750e9f-fe64-4028-9caf-52d7187462f1",
"metadata": {},
"source": [
"## Improve the search results\n",
"\n",
"For even better results, a slightly different approach has been prepared that can improve search results. It is quite resource-intensive, so it is applied after the main algorithm has found the most relevant images. This approach works only with text queries. Among the parameters you can choose 3 models: `\"blip_base\"`, `\"blip_large\"`, `\"blip2_coco\"`. If you get an `Out of Memory` error, try reducing the batch_size value (minimum = 1), which is the number of images being processed simultaneously. With the parameter `need_grad_cam = True/False` you can enable the calculation of the heat map of each image to be processed. Thus the `image_text_match_reordering` function calculates new similarity values and new ranks for each image. The resulting values are added to the general dictionary."
|
|
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b3af7b39-6d0d-4da3-9b8f-7dfd3f5779be",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.692699Z",
"iopub.status.busy": "2023-08-16T08:17:20.692152Z",
"iopub.status.idle": "2023-08-16T08:17:20.695909Z",
"shell.execute_reply": "2023-08-16T08:17:20.695091Z"
},
"tags": []
},
"outputs": [],
"source": [
"itm_model = \"blip_base\"\n",
"# itm_model = \"blip_large\"\n",
"# itm_model = \"blip2_coco\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "caf1f4ae-4b37-4954-800e-7120f0419de5",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.699533Z",
"iopub.status.busy": "2023-08-16T08:17:20.699105Z",
"iopub.status.idle": "2023-08-16T08:17:20.742251Z",
"shell.execute_reply": "2023-08-16T08:17:20.741288Z"
},
"tags": []
},
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'image_keys' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[21], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m itm_scores, image_gradcam_with_itm \u001b[38;5;241m=\u001b[39m my_obj\u001b[38;5;241m.\u001b[39mimage_text_match_reordering(\n\u001b[1;32m 2\u001b[0m search_query3,\n\u001b[1;32m 3\u001b[0m itm_model,\n\u001b[0;32m----> 4\u001b[0m \u001b[43mimage_keys\u001b[49m,\n\u001b[1;32m 5\u001b[0m sorted_lists,\n\u001b[1;32m 6\u001b[0m batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m,\n\u001b[1;32m 7\u001b[0m need_grad_cam\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 8\u001b[0m )\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'image_keys' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"itm_scores, image_gradcam_with_itm = my_obj.image_text_match_reordering(\n",
|
|
" search_query3,\n",
|
|
" itm_model,\n",
|
|
" image_keys,\n",
|
|
" sorted_lists,\n",
|
|
" batch_size=1,\n",
|
|
" need_grad_cam=True,\n",
|
|
")"
|
|
]
|
|
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9e98c150-5fab-4251-bce7-0d8fc7b385b9",
"metadata": {},
"source": [
"Then using the same output function you can add the `ITM=True` arguments to output the new image order. You can also add the `image_gradcam_with_itm` argument to output the heat maps of the calculated images. "
|
|
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "6a829b99-5230-463a-8b11-30ffbb67fc3a",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.747193Z",
"iopub.status.busy": "2023-08-16T08:17:20.746596Z",
"iopub.status.idle": "2023-08-16T08:17:20.786202Z",
"shell.execute_reply": "2023-08-16T08:17:20.785348Z"
},
"tags": []
},
"outputs": [
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'image_gradcam_with_itm' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[22], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m my_obj\u001b[38;5;241m.\u001b[39mshow_results(\n\u001b[0;32m----> 2\u001b[0m search_query3[\u001b[38;5;241m0\u001b[39m], itm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, image_gradcam_with_itm\u001b[38;5;241m=\u001b[39m\u001b[43mimage_gradcam_with_itm\u001b[49m\n\u001b[1;32m 3\u001b[0m )\n",
|
|
"\u001b[0;31mNameError\u001b[0m: name 'image_gradcam_with_itm' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"my_obj.show_results(\n",
|
|
" search_query3[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm\n",
|
|
")"
|
|
]
|
|
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d86ab96b-1907-4b7f-a78e-3983b516d781",
"metadata": {
"tags": []
},
"source": [
"## Save search results to csv"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "4bdbc4d4-695d-4751-ab7c-d2d98e2917d7",
"metadata": {
"tags": []
},
"source": [
"Convert the dictionary of dictionarys into a dictionary with lists:"
|
|
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6c6ddd83-bc87-48f2-a8d6-1bd3f4201ff7",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.790474Z",
"iopub.status.busy": "2023-08-16T08:17:20.789727Z",
"iopub.status.idle": "2023-08-16T08:17:20.794132Z",
"shell.execute_reply": "2023-08-16T08:17:20.793399Z"
},
"tags": []
},
"outputs": [],
"source": [
"outdict = mutils.append_data_to_dict(mydict)\n",
"df = mutils.dump_df(outdict)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ea2675d5-604c-45e7-86d2-080b1f4559a0",
"metadata": {
"tags": []
},
"source": [
"Check the dataframe:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "e78646d6-80be-4d3e-8123-3360957bcaa8",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.797387Z",
"iopub.status.busy": "2023-08-16T08:17:20.796976Z",
"iopub.status.idle": "2023-08-16T08:17:20.806088Z",
"shell.execute_reply": "2023-08-16T08:17:20.805297Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>data/102730_eng.png</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>data/102141_2_eng.png</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>data/106349S_por.png</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filename\n",
|
|
"0 data/102730_eng.png\n",
|
|
"1 data/102141_2_eng.png\n",
|
|
"2 data/106349S_por.png"
|
|
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "05546d99-afab-4565-8f30-f14e1426abcf",
"metadata": {},
"source": [
"Write the csv file:"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "185f7dde-20dc-44d8-9ab0-de41f9b5734d",
"metadata": {
"execution": {
"iopub.execute_input": "2023-08-16T08:17:20.809662Z",
"iopub.status.busy": "2023-08-16T08:17:20.809120Z",
"iopub.status.idle": "2023-08-16T08:17:20.814573Z",
"shell.execute_reply": "2023-08-16T08:17:20.813902Z"
},
"tags": []
},
"outputs": [],
"source": [
"df.to_csv(\"data/data_out.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6a79201-7c17-496c-a6a1-b8ecfd3dd1e8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
}
},
"nbformat": 4,
"nbformat_minor": 5
}