add image summary notebook

2025-10-29 13:06:04 +02:00 · 2023-01-27 14:04:54 +01:00 · 2023-01-27 14:04:54 +01:00 · 372a09454e
--- a/misinformation/display.py
+++ b/misinformation/display.py
@ -4,6 +4,7 @@ from IPython.display import display
 import misinformation.faces as faces
 import misinformation.text as text
 import misinformation.objects as objects
+import misinformation.summary as summary


 class JSONContainer:
@ -26,6 +27,7 @@ def explore_analysis(mydict, identify="faces"):
        "faces": faces.EmotionDetector,
        "text-on-image": text.TextDetector,
        "objects": objects.ObjectDetector,
+        "summary": summary.SummaryDetector,
    }
    # create a list containing the image ids for the widget
    # image_paths = [mydict[key]["filename"] for key in mydict.keys()]
--- a/misinformation/summary.py
+++ b/misinformation/summary.py
@ -0,0 +1,64 @@
+from misinformation.utils import AnalysisMethod
+import torch
+from PIL import Image
+from lavis.models import load_model_and_preprocess
+
+
+class SummaryDetector(AnalysisMethod):
+    def __init__(
+        self, subdict: dict
+    ) -> None:
+        super().__init__(subdict)
+        self.subdict.update(self.set_keys())
+        self.image_summary = {
+            "const_image_summary": None,
+            "3_non-deterministic summary": None,
+        }
+
+    summary_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    summary_model, summary_vis_processors, _ = load_model_and_preprocess(
+        name="blip_caption", model_type="base_coco", is_eval=True, device=summary_device
+        )
+    
+    def set_keys(self) -> dict:
+        params = {
+            "const_image_summary": None,
+            "3_non-deterministic summary": None,
+        }
+        return params
+
+    def analyse_image(self):
+
+        path = self.subdict["filename"]
+        raw_image = Image.open(path).convert("RGB")
+        image = self.summary_vis_processors["eval"](raw_image).unsqueeze(0).to(self.summary_device)
+        self.image_summary["const_image_summary"] = self.summary_model.generate({"image": image})[0]
+        self.image_summary["3_non-deterministic summary"] = self.summary_model.generate({"image": image}, use_nucleus_sampling=True, num_captions=3)
+        for key in self.image_summary:
+            self.subdict[key] = self.image_summary[key]
+        return self.subdict
+    
+    summary_VQA_model, summary_VQA_vis_processors, summary_VQA_txt_processors = load_model_and_preprocess(name="blip_vqa", model_type="vqav2", is_eval=True, device=summary_device)
+
+    def analyse_questions(self, list_of_questions):
+        if (len(list_of_questions)>0):
+            path = self.subdict["filename"]
+            raw_image = Image.open(path).convert("RGB")
+            image = self.summary_VQA_vis_processors["eval"](raw_image).unsqueeze(0).to(self.summary_device)
+            question_batch =[]
+            for quest in list_of_questions:
+                question_batch.append(self.summary_VQA_txt_processors["eval"](quest))
+            batch_size = len(list_of_questions)
+            image_batch = image.repeat(batch_size, 1, 1, 1)
+
+            answers_batch = self.summary_VQA_model.predict_answers(samples={"image": image_batch, "text_input": question_batch}, inference_method="generate")
+        
+            for q,a in zip(question_batch,answers_batch):
+                self.image_summary[q] = a
+        
+            for key in self.image_summary:
+                self.subdict[key] = self.image_summary[key]
+        else: 
+            print("Please, enter list of questions")
+        return self.subdict    
--- a/notebooks/image_summary.ipynb
+++ b/notebooks/image_summary.ipynb
@ -0,0 +1,665 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Image summary and visual question answering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook shows some preliminary work on image captioning and visual question answering with LAVIS. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `misinformation` package that is imported here:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-01-27 13:43:45.543761: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
+      "2023-01-27 13:43:45.940025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pandriushchenko/anaconda3/envs/misinfo/lib/python3.10/site-packages/cv2/../../lib64:\n",
+      "2023-01-27 13:43:45.940060: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pandriushchenko/anaconda3/envs/misinfo/lib/python3.10/site-packages/cv2/../../lib64:\n",
+      "2023-01-27 13:43:45.940063: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import misinformation\n",
+    "import misinformation.summary as sm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set an image path as input file path."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "images = misinformation.find_files(\n",
+    "    path=\"../data/images/\",\n",
+    "    limit=1000,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mydict = misinformation.utils.initialize_dict(images)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'100132S_ara': {'filename': '../data/images/100132S_ara.png'},\n",
+       " '100447_ind': {'filename': '../data/images/100447_ind.png'},\n",
+       " '100127S_ara': {'filename': '../data/images/100127S_ara.png'},\n",
+       " '100134S_ara': {'filename': '../data/images/100134S_ara.png'},\n",
+       " '109257_1_spa': {'filename': '../data/images/109257_1_spa.png'},\n",
+       " '100130S_ara': {'filename': '../data/images/100130S_ara.png'},\n",
+       " '100131S_ara': {'filename': '../data/images/100131S_ara.png'},\n",
+       " '102135S_eng': {'filename': '../data/images/102135S_eng.png'},\n",
+       " '102435S_2_eng': {'filename': '../data/images/102435S_2_eng.png'},\n",
+       " '100368_asm': {'filename': '../data/images/100368_asm.png'},\n",
+       " '100361_asm': {'filename': '../data/images/100361_asm.png'},\n",
+       " '102141_1_eng': {'filename': '../data/images/102141_1_eng.png'},\n",
+       " '106958S_por': {'filename': '../data/images/106958S_por.png'},\n",
+       " '102134S_eng': {'filename': '../data/images/102134S_eng.png'},\n",
+       " '102133S_eng': {'filename': '../data/images/102133S_eng.png'},\n",
+       " '100450_ind': {'filename': '../data/images/100450_ind.png'},\n",
+       " '100451S_ind': {'filename': '../data/images/100451S_ind.png'}}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mydict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create captions for images and directly write to csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for key in mydict:\n",
+    "    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "Convert the dictionary of dictionaries into a dictionary with lists:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "outdict = misinformation.utils.append_data_to_dict(mydict)\n",
+    "df = misinformation.utils.dump_df(outdict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check the dataframe:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>const_image_summary</th>\n",
+       "      <th>3_non-deterministic summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>../data/images/100132S_ara.png</td>\n",
+       "      <td>a white car parked in front of a building cove...</td>\n",
+       "      <td>[someone has wrapped up a large plastic bag ov...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>../data/images/100447_ind.png</td>\n",
+       "      <td>a woman drinking from a bottle while standing ...</td>\n",
+       "      <td>[a woman drinks out of a bottle and stands nex...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>../data/images/100127S_ara.png</td>\n",
+       "      <td>a map of the world with arabic writing</td>\n",
+       "      <td>[a map of the world with a message in arabic, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>../data/images/100134S_ara.png</td>\n",
+       "      <td>a woman is standing in front of a sign</td>\n",
+       "      <td>[two women walking and talking to each other, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>../data/images/109257_1_spa.png</td>\n",
+       "      <td>a man in a suit and tie making a face</td>\n",
+       "      <td>[a man is smiling and making a funny face, man...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>../data/images/100130S_ara.png</td>\n",
+       "      <td>a group of people walking down a street next t...</td>\n",
+       "      <td>[two people on the street in front of a big tr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>../data/images/100131S_ara.png</td>\n",
+       "      <td>a group of people standing in front of a tv</td>\n",
+       "      <td>[the president is addressing his nation of the...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>../data/images/102135S_eng.png</td>\n",
+       "      <td>a woman standing in front of a store filled wi...</td>\n",
+       "      <td>[people in a supermarket standing in front of ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>../data/images/102435S_2_eng.png</td>\n",
+       "      <td>a man in a suit and glasses is talking</td>\n",
+       "      <td>[the man is speaking about his favorite tv sho...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>../data/images/100368_asm.png</td>\n",
+       "      <td>a group of people standing next to each other</td>\n",
+       "      <td>[people doing a job next to a line of men, men...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                           filename  \\\n",
+       "0    ../data/images/100132S_ara.png   \n",
+       "1     ../data/images/100447_ind.png   \n",
+       "2    ../data/images/100127S_ara.png   \n",
+       "3    ../data/images/100134S_ara.png   \n",
+       "4   ../data/images/109257_1_spa.png   \n",
+       "5    ../data/images/100130S_ara.png   \n",
+       "6    ../data/images/100131S_ara.png   \n",
+       "7    ../data/images/102135S_eng.png   \n",
+       "8  ../data/images/102435S_2_eng.png   \n",
+       "9     ../data/images/100368_asm.png   \n",
+       "\n",
+       "                                 const_image_summary  \\\n",
+       "0  a white car parked in front of a building cove...   \n",
+       "1  a woman drinking from a bottle while standing ...   \n",
+       "2             a map of the world with arabic writing   \n",
+       "3             a woman is standing in front of a sign   \n",
+       "4              a man in a suit and tie making a face   \n",
+       "5  a group of people walking down a street next t...   \n",
+       "6        a group of people standing in front of a tv   \n",
+       "7  a woman standing in front of a store filled wi...   \n",
+       "8             a man in a suit and glasses is talking   \n",
+       "9      a group of people standing next to each other   \n",
+       "\n",
+       "                         3_non-deterministic summary  \n",
+       "0  [someone has wrapped up a large plastic bag ov...  \n",
+       "1  [a woman drinks out of a bottle and stands nex...  \n",
+       "2  [a map of the world with a message in arabic, ...  \n",
+       "3  [two women walking and talking to each other, ...  \n",
+       "4  [a man is smiling and making a funny face, man...  \n",
+       "5  [two people on the street in front of a big tr...  \n",
+       "6  [the president is addressing his nation of the...  \n",
+       "7  [people in a supermarket standing in front of ...  \n",
+       "8  [the man is speaking about his favorite tv sho...  \n",
+       "9  [people doing a job next to a line of men, men...  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Write the csv file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(\"./data_out.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Manually inspect the summaries\n",
+    "\n",
+    "To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this step.\n",
+    "\n",
+    "`const_image_summary` - the permanent summary, which does not change from run to run (analyse_image).\n",
+    "\n",
+    "`3_non-deterministic summary` - three different example summaries that change from run to run (analyse_image). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0324b10be268470ab4e550cb0153b9e8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(Select(layout=Layout(width='20%'), options=('100132S_ara', '100447_ind', '100127S_ara', '100134…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "misinformation.explore_analysis(mydict, identify=\"summary\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generate answers to free-form, natural-language questions about images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the list of questions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list_of_questions = [\n",
+    "\"How many persons on the picture?\",\n",
+    "\"Are there any politicians in the picture?\",\n",
+    "\"Does the picture show something from medicine?\",    \n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for key in mydict:\n",
+    "    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_questions(list_of_questions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e446aa565f6345ab8256771f578fbf92",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(Select(layout=Layout(width='20%'), options=('100132S_ara', '100447_ind', '100127S_ara', '100134…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "misinformation.explore_analysis(mydict, identify=\"summary\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert the dictionary of dictionaries into a dictionary with lists:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outdict2 = misinformation.utils.append_data_to_dict(mydict)\n",
+    "df2 = misinformation.utils.dump_df(outdict2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>const_image_summary</th>\n",
+       "      <th>3_non-deterministic summary</th>\n",
+       "      <th>how many persons on the picture?</th>\n",
+       "      <th>are there any politicians in the picture?</th>\n",
+       "      <th>does the picture show something from medicine?</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>../data/images/100132S_ara.png</td>\n",
+       "      <td>a white car parked in front of a building cove...</td>\n",
+       "      <td>[the man is sitting on a car near a large bann...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>../data/images/100447_ind.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>../data/images/100127S_ara.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>../data/images/100134S_ara.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>../data/images/109257_1_spa.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>1</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>../data/images/100130S_ara.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>../data/images/100131S_ara.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>many</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>../data/images/102135S_eng.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>6</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>../data/images/102435S_2_eng.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>1</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>../data/images/100368_asm.png</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>15</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                           filename  \\\n",
+       "0    ../data/images/100132S_ara.png   \n",
+       "1     ../data/images/100447_ind.png   \n",
+       "2    ../data/images/100127S_ara.png   \n",
+       "3    ../data/images/100134S_ara.png   \n",
+       "4   ../data/images/109257_1_spa.png   \n",
+       "5    ../data/images/100130S_ara.png   \n",
+       "6    ../data/images/100131S_ara.png   \n",
+       "7    ../data/images/102135S_eng.png   \n",
+       "8  ../data/images/102435S_2_eng.png   \n",
+       "9     ../data/images/100368_asm.png   \n",
+       "\n",
+       "                                 const_image_summary  \\\n",
+       "0  a white car parked in front of a building cove...   \n",
+       "1                                               None   \n",
+       "2                                               None   \n",
+       "3                                               None   \n",
+       "4                                               None   \n",
+       "5                                               None   \n",
+       "6                                               None   \n",
+       "7                                               None   \n",
+       "8                                               None   \n",
+       "9                                               None   \n",
+       "\n",
+       "                         3_non-deterministic summary  \\\n",
+       "0  [the man is sitting on a car near a large bann...   \n",
+       "1                                               None   \n",
+       "2                                               None   \n",
+       "3                                               None   \n",
+       "4                                               None   \n",
+       "5                                               None   \n",
+       "6                                               None   \n",
+       "7                                               None   \n",
+       "8                                               None   \n",
+       "9                                               None   \n",
+       "\n",
+       "  how many persons on the picture? are there any politicians in the picture?  \\\n",
+       "0                                1                                        no   \n",
+       "1                                2                                        no   \n",
+       "2                                0                                        no   \n",
+       "3                                2                                        no   \n",
+       "4                                1                                       yes   \n",
+       "5                                3                                        no   \n",
+       "6                             many                                       yes   \n",
+       "7                                6                                        no   \n",
+       "8                                1                                       yes   \n",
+       "9                               15                                       yes   \n",
+       "\n",
+       "  does the picture show something from medicine?  \n",
+       "0                                             no  \n",
+       "1                                            yes  \n",
+       "2                                             no  \n",
+       "3                                            yes  \n",
+       "4                                             no  \n",
+       "5                                             no  \n",
+       "6                                             no  \n",
+       "7                                             no  \n",
+       "8                                             no  \n",
+       "9                                             no  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df2.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.to_csv(\"./data_out2.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "f1142466f556ab37fe2d38e2897a16796906208adb09fea90ba58bdf8a56f0ba"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,6 +12,7 @@ readme = "README.md"
 maintainers = [
    { name = "Inga Ulusoy", email = "ssc@iwr.uni-heidelberg.de" },
    { name = "Dominic Kempf", email = "ssc@iwr.uni-heidelberg.de" },
+    { name = "Petr Andriushchenko", email = "ssc@iwr.uni-heidelberg.de" },
 ]
 requires-python = ">=3.9"
 license = { text = "MIT" }
@ -45,6 +46,8 @@ dependencies = [
    "jupyterlab",
    "spacytextblob",
    "textblob",
+    "torch",
+    "salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@main"
 ]

 [project.scripts]