diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d8f5e53..f9207b0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-22.04, windows-latest] + os: [ubuntu-22.04,windows-latest] python-version: [3.9] steps: - name: Checkout repository @@ -32,7 +32,7 @@ jobs: - name: Run pytest run: | cd misinformation - python -m pytest -s -m "not gcv" --cov=. --cov-report=xml + python -m pytest --cov=. --cov-report=xml - name: Upload coverage if: matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.9' uses: codecov/codecov-action@v3 diff --git a/README.md b/README.md index b058828..6013352 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,20 @@ pip install . ``` This will install the package and its dependencies locally. +## Installation on Windows + +Some modules use [lavis](https://github.com/salesforce/LAVIS) to analyse image content. To enable this functionality on Windows, you need to install a few dependencies that are not available by default: +1. Download [Visual C++](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170) and install it (see also [here](https://github.com/philferriere/cocoapi)). +1. Then install the COCO API from GitHub +``` +pip install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" +``` +1. Now you can install the package by navigating to the misinformation directory and typing +``` +pip install . +``` +in the command prompt. + # Usage There are sample notebooks in the `misinformation/notebooks` folder for you to explore the package: diff --git a/misinformation/__init__.py b/misinformation/__init__.py index dd3f43c..5b14888 100644 --- a/misinformation/__init__.py +++ b/misinformation/__init__.py @@ -8,11 +8,3 @@ except ImportError: # Export the version defined in project metadata __version__ = metadata.version(__package__) del metadata - -from misinformation.display import explore_analysis -from misinformation.utils import ( - find_files, - initialize_dict, - append_data_to_dict, - dump_df, -) diff --git a/misinformation/display.py b/misinformation/display.py index 40bb8f4..72851a9 100644 --- a/misinformation/display.py +++ b/misinformation/display.py @@ -5,6 +5,8 @@ import misinformation.faces as faces import misinformation.text as text import misinformation.objects as objects +import misinformation.summary as summary + class JSONContainer: """Expose a Python dictionary as a JSON document in JupyterLab @@ -26,6 +28,7 @@ def explore_analysis(mydict, identify="faces"): "faces": faces.EmotionDetector, "text-on-image": text.TextDetector, "objects": objects.ObjectDetector, + "summary": summary.SummaryDetector, } # create a list containing the image ids for the widget # image_paths = [mydict[key]["filename"] for key in mydict.keys()] diff --git a/misinformation/faces.py b/misinformation/faces.py index 76ac369..18265ac 100644 --- a/misinformation/faces.py +++ b/misinformation/faces.py @@ -141,7 +141,7 @@ class EmotionDetector(utils.AnalysisMethod): DeepFace.analyze( img_path=face, actions=actions, - prog_bar=False, + silent=True, detector_backend="skip", ) ) diff --git a/misinformation/multimodal_search.py b/misinformation/multimodal_search.py new file mode 100644 index 0000000..3405493 --- /dev/null +++ b/misinformation/multimodal_search.py @@ -0,0 +1,358 @@ +from misinformation.utils import AnalysisMethod +import torch +import torch.nn.functional as Func +import requests +import 
lavis +from PIL import Image +from IPython.display import display +from lavis.models import load_model_and_preprocess + + +class MultimodalSearch(AnalysisMethod): + def __init__(self, subdict: dict) -> None: + super().__init__(subdict) + # self.subdict.update(self.set_keys()) + + multimodal_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def load_feature_extractor_model_blip2(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="blip2_feature_extractor", + model_type="pretrain", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def load_feature_extractor_model_blip(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="blip_feature_extractor", + model_type="base", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def load_feature_extractor_model_albef(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="albef_feature_extractor", + model_type="base", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def load_feature_extractor_model_clip_base(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="clip_feature_extractor", + model_type="base", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def load_feature_extractor_model_clip_vitl14(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="clip_feature_extractor", + model_type="ViT-L-14", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def load_feature_extractor_model_clip_vitl14_336(self, device): + model, vis_processors, txt_processors = load_model_and_preprocess( + name="clip_feature_extractor", + model_type="ViT-L-14-336", + is_eval=True, + device=device, + ) + return model, vis_processors, txt_processors + + def read_img(self, filepath): + raw_image = Image.open(filepath).convert("RGB") + return raw_image + + def read_and_process_images(self, image_paths, vis_processor): + raw_images = [MultimodalSearch.read_img(self, path) for path in image_paths] + images = [ + vis_processor["eval"](r_img) + .unsqueeze(0) + .to(MultimodalSearch.multimodal_device) + for r_img in raw_images + ] + images_tensors = torch.stack(images) + + return raw_images, images_tensors + + def extract_image_features_blip2(self, model, images_tensors): + with torch.cuda.amp.autocast( + enabled=(MultimodalSearch.multimodal_device != torch.device("cpu")) + ): + features_image = [ + model.extract_features({"image": ten, "text_input": ""}, mode="image") + for ten in images_tensors + ] + features_image_stacked = torch.stack( + [feat.image_embeds_proj[:, 0, :].squeeze(0) for feat in features_image] + ) + return features_image_stacked + + def extract_image_features_clip(self, model, images_tensors): + features_image = [ + model.extract_features({"image": ten}) for ten in images_tensors + ] + features_image_stacked = torch.stack( + [Func.normalize(feat.float(), dim=-1).squeeze(0) for feat in features_image] + ) + return features_image_stacked + + def extract_image_features_basic(self, model, images_tensors): + features_image = [ + model.extract_features({"image": ten, "text_input": ""}, mode="image") + for ten in images_tensors + ] + features_image_stacked = torch.stack( + [feat.image_embeds_proj[:, 0, :].squeeze(0) for feat in features_image] + ) + return features_image_stacked + 
+ def save_tensors( + self, model_type, features_image_stacked, name="saved_features_image.pt" + ): + with open( + str(len(features_image_stacked)) + "_" + model_type + "_" + name, "wb" + ) as f: + torch.save(features_image_stacked, f) + return name + + def load_tensors(self, name="saved_features_image.pt"): + features_image_stacked = torch.load(name) + return features_image_stacked + + def extract_text_features(self, model, text_input): + sample_text = {"text_input": [text_input]} + features_text = model.extract_features(sample_text, mode="text") + + return features_text + + def parsing_images(self, model_type, path_to_saved_tensors=None): + + if model_type in ("clip_base", "clip_vitl14_336", "clip_vitl14"): + path_to_lib = lavis.__file__[:-11] + "models/clip_models/" + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz" + r = requests.get(url, allow_redirects=False) + open(path_to_lib + "bpe_simple_vocab_16e6.txt.gz", "wb").write(r.content) + + image_keys = sorted(self.keys()) + image_names = [self[k]["filename"] for k in image_keys] + + select_model = { + "blip2": MultimodalSearch.load_feature_extractor_model_blip2, + "blip": MultimodalSearch.load_feature_extractor_model_blip, + "albef": MultimodalSearch.load_feature_extractor_model_albef, + "clip_base": MultimodalSearch.load_feature_extractor_model_clip_base, + "clip_vitl14": MultimodalSearch.load_feature_extractor_model_clip_vitl14, + "clip_vitl14_336": MultimodalSearch.load_feature_extractor_model_clip_vitl14_336, + } + + select_extract_image_features = { + "blip2": MultimodalSearch.extract_image_features_blip2, + "blip": MultimodalSearch.extract_image_features_basic, + "albef": MultimodalSearch.extract_image_features_basic, + "clip_base": MultimodalSearch.extract_image_features_clip, + "clip_vitl14": MultimodalSearch.extract_image_features_clip, + "clip_vitl14_336": MultimodalSearch.extract_image_features_clip, + } + + if model_type in select_model.keys(): + (model, vis_processors, txt_processors,) = select_model[ + model_type + ](self, MultimodalSearch.multimodal_device) + else: + raise SyntaxError( + "Please, use one of the following models: blip2, blip, albef, clip_base, clip_vitl14, clip_vitl14_336" + ) + + raw_images, images_tensors = MultimodalSearch.read_and_process_images( + self, image_names, vis_processors + ) + if path_to_saved_tensors is None: + with torch.no_grad(): + features_image_stacked = select_extract_image_features[model_type]( + self, model, images_tensors + ) + MultimodalSearch.save_tensors(self, model_type, features_image_stacked) + else: + features_image_stacked = MultimodalSearch.load_tensors( + self, str(path_to_saved_tensors) + ) + + return ( + model, + vis_processors, + txt_processors, + image_keys, + image_names, + features_image_stacked, + ) + + def querys_processing( + self, search_query, model, txt_processors, vis_processors, model_type + ): + + select_extract_image_features = { + "blip2": MultimodalSearch.extract_image_features_blip2, + "blip": MultimodalSearch.extract_image_features_basic, + "albef": MultimodalSearch.extract_image_features_basic, + "clip_base": MultimodalSearch.extract_image_features_clip, + "clip_vitl14": MultimodalSearch.extract_image_features_clip, + "clip_vitl14_336": MultimodalSearch.extract_image_features_clip, + } + + for query in search_query: + if not (len(query) == 1) and (query in ("image", "text_input")): + raise SyntaxError( + 'Each querry must contain either an "image" or a "text_input"' + ) + 
multi_sample = [] + for query in search_query: + if "text_input" in query.keys(): + text_processing = txt_processors["eval"](query["text_input"]) + images_tensors = "" + elif "image" in query.keys(): + _, images_tensors = MultimodalSearch.read_and_process_images( + self, [query["image"]], vis_processors + ) + text_processing = "" + multi_sample.append( + {"image": images_tensors, "text_input": text_processing} + ) + + multi_features_query = [] + for query in multi_sample: + if query["image"] == "": + if model_type in ("clip_base", "clip_vitl14_336", "clip_vitl14"): + features = model.extract_features( + {"text_input": query["text_input"]} + ) + + features_squeeze = features.squeeze(0).to( + MultimodalSearch.multimodal_device + ) + multi_features_query.append( + Func.normalize(features_squeeze, dim=-1) + ) + else: + features = model.extract_features(query, mode="text") + features_squeeze = ( + features.text_embeds_proj[:, 0, :] + .squeeze(0) + .to(MultimodalSearch.multimodal_device) + ) + multi_features_query.append(features_squeeze) + if query["text_input"] == "": + multi_features_query.append( + select_extract_image_features[model_type]( + self, model, query["image"] + ) + ) + + multi_features_stacked = torch.stack( + [query.squeeze(0) for query in multi_features_query] + ).to(MultimodalSearch.multimodal_device) + + return multi_features_stacked + + def multimodal_search( + self, + model, + vis_processors, + txt_processors, + model_type, + image_keys, + features_image_stacked, + search_query, + filter_number_of_images=None, + filter_val_limit=None, + filter_rel_error=None, + ): + if filter_number_of_images is None: + filter_number_of_images = len(self) + if filter_val_limit is None: + filter_val_limit = 0 + if filter_rel_error is None: + filter_rel_error = 1e10 + + features_image_stacked.to(MultimodalSearch.multimodal_device) + + with torch.no_grad(): + multi_features_stacked = MultimodalSearch.querys_processing( + self, search_query, model, txt_processors, vis_processors, model_type + ) + + similarity = features_image_stacked @ multi_features_stacked.t() + # similarity_soft_max = torch.nn.Softmax(dim=0)(similarity / 0.01) + sorted_lists = [ + sorted(range(len(similarity)), key=lambda k: similarity[k, i], reverse=True) + for i in range(len(similarity[0])) + ] + places = [[item.index(i) for i in range(len(item))] for item in sorted_lists] + + for q in range(len(search_query)): + max_val = similarity[sorted_lists[q][0]][q].item() + print(max_val) + for i, key in zip(range(len(image_keys)), sorted_lists[q]): + if ( + i < filter_number_of_images + and similarity[key][q].item() > filter_val_limit + and 100 * abs(max_val - similarity[key][q].item()) / max_val + < filter_rel_error + ): + self[image_keys[key]][ + "rank " + list(search_query[q].values())[0] + ] = places[q][key] + self[image_keys[key]][ + list(search_query[q].values())[0] + ] = similarity[key][q].item() + else: + self[image_keys[key]][ + "rank " + list(search_query[q].values())[0] + ] = None + self[image_keys[key]][list(search_query[q].values())[0]] = 0 + return similarity, sorted_lists + + def show_results(self, query): + if "image" in query.keys(): + pic = Image.open(query["image"]).convert("RGB") + pic.thumbnail((400, 400)) + display( + "Your search query: ", + pic, + "--------------------------------------------------", + "Results:", + ) + elif "text_input" in query.keys(): + display( + "Your search query: " + query["text_input"], + "--------------------------------------------------", + "Results:", + ) + for s in sorted( + 
self.items(), key=lambda t: t[1][list(query.values())[0]], reverse=True + ): + if s[1]["rank " + list(query.values())[0]] is None: + break + p1 = Image.open(s[1]["filename"]).convert("RGB") + p1.thumbnail((400, 400)) + display( + "Rank: " + + str(s[1]["rank " + list(query.values())[0]]) + + " Val: " + + str(s[1][list(query.values())[0]]), + s[0], + p1, + ) + display( + "--------------------------------------------------", + ) diff --git a/misinformation/objects_cvlib.py b/misinformation/objects_cvlib.py index 076aa77..556d4f1 100644 --- a/misinformation/objects_cvlib.py +++ b/misinformation/objects_cvlib.py @@ -1,5 +1,7 @@ import cv2 import cvlib as cv +import numpy as np +from PIL import Image def objects_from_cvlib(objects_list: list) -> dict: @@ -50,7 +52,11 @@ class ObjectCVLib(ObjectsMethod): image_path: The path to the local file. """ img = cv2.imread(image_path) - bbox, label, conf = cv.detect_common_objects(img) + # preimg = Image.open(image_path).convert("RGB") + # preimg2 = np.asarray(preimg) + # img = cv2.cvtColor(preimg2, cv2.COLOR_BGR2RGB) + + _, label, _ = cv.detect_common_objects(img) # output_image = draw_bbox(im, bbox, label, conf) objects = objects_from_cvlib(label) return objects diff --git a/misinformation/summary.py b/misinformation/summary.py new file mode 100644 index 0000000..b348119 --- /dev/null +++ b/misinformation/summary.py @@ -0,0 +1,104 @@ +from misinformation.utils import AnalysisMethod +from torch import device, cuda, no_grad +from PIL import Image +from lavis.models import load_model_and_preprocess + + +class SummaryDetector(AnalysisMethod): + def __init__(self, subdict: dict) -> None: + super().__init__(subdict) + + summary_device = device("cuda" if cuda.is_available() else "cpu") + summary_model, summary_vis_processors, _ = load_model_and_preprocess( + name="blip_caption", + model_type="base_coco", + is_eval=True, + device=summary_device, + ) + + def load_model_base(self): + summary_device = device("cuda" if cuda.is_available() else "cpu") + summary_model, summary_vis_processors, _ = load_model_and_preprocess( + name="blip_caption", + model_type="base_coco", + is_eval=True, + device=summary_device, + ) + return summary_model, summary_vis_processors + + def load_model_large(self): + summary_device = device("cuda" if cuda.is_available() else "cpu") + summary_model, summary_vis_processors, _ = load_model_and_preprocess( + name="blip_caption", + model_type="large_coco", + is_eval=True, + device=summary_device, + ) + return summary_model, summary_vis_processors + + def load_model(self, model_type): + select_model = { + "base": SummaryDetector.load_model_base, + "large": SummaryDetector.load_model_large, + } + summary_model, summary_vis_processors = select_model[model_type](self) + return summary_model, summary_vis_processors + + def analyse_image(self, summary_model=None, summary_vis_processors=None): + + if summary_model is None and summary_vis_processors is None: + summary_model = SummaryDetector.summary_model + summary_vis_processors = SummaryDetector.summary_vis_processors + + path = self.subdict["filename"] + raw_image = Image.open(path).convert("RGB") + image = ( + summary_vis_processors["eval"](raw_image) + .unsqueeze(0) + .to(self.summary_device) + ) + with no_grad(): + self.subdict["const_image_summary"] = summary_model.generate( + {"image": image} + )[0] + self.subdict["3_non-deterministic summary"] = summary_model.generate( + {"image": image}, use_nucleus_sampling=True, num_captions=3 + ) + return self.subdict + + ( + summary_VQA_model, + 
summary_VQA_vis_processors, + summary_VQA_txt_processors, + ) = load_model_and_preprocess( + name="blip_vqa", model_type="vqav2", is_eval=True, device=summary_device + ) + + def analyse_questions(self, list_of_questions): + + if len(list_of_questions) > 0: + path = self.subdict["filename"] + raw_image = Image.open(path).convert("RGB") + image = ( + self.summary_VQA_vis_processors["eval"](raw_image) + .unsqueeze(0) + .to(self.summary_device) + ) + question_batch = [] + for quest in list_of_questions: + question_batch.append(self.summary_VQA_txt_processors["eval"](quest)) + batch_size = len(list_of_questions) + image_batch = image.repeat(batch_size, 1, 1, 1) + + with no_grad(): + answers_batch = self.summary_VQA_model.predict_answers( + samples={"image": image_batch, "text_input": question_batch}, + inference_method="generate", + ) + + for q, a in zip(list_of_questions, answers_batch): + self.subdict[q] = a + + else: + print("Please, enter list of questions") + return self.subdict diff --git a/misinformation/test/conftest.py b/misinformation/test/conftest.py new file mode 100644 index 0000000..3a43142 --- /dev/null +++ b/misinformation/test/conftest.py @@ -0,0 +1,18 @@ +import os +import pytest + + +@pytest.fixture +def get_path(request): + mypath = os.path.dirname(request.module.__file__) + mypath = mypath + "/data/" + return mypath + + +@pytest.fixture +def set_environ(request): + mypath = os.path.dirname(request.module.__file__) + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ( + mypath + "/../../data/seismic-bonfire-329406-412821a70264.json" + ) + print(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")) diff --git a/misinformation/test/data/IMG_2809.png b/misinformation/test/data/IMG_2809.png index 689525e..591e560 100644 Binary files a/misinformation/test/data/IMG_2809.png and b/misinformation/test/data/IMG_2809.png differ diff --git a/misinformation/test/data/IMG_3758.png b/misinformation/test/data/IMG_3758.png new file mode 100644 index 0000000..bf385ee Binary files /dev/null and b/misinformation/test/data/IMG_3758.png differ diff --git a/misinformation/test/data/example_objects_cvlib.json b/misinformation/test/data/example_objects_cvlib.json index 7e9b9b6..2b0ee03 100644 --- a/misinformation/test/data/example_objects_cvlib.json +++ b/misinformation/test/data/example_objects_cvlib.json @@ -1 +1 @@ -{"filename": "./test/data/IMG_2809.png", "person": "yes", "bicycle": "no", "car": "yes", "motorcycle": "no", "airplane": "no", "bus": "yes", "train": "no", "truck": "no", "boat": "no", "traffic light": "no", "cell phone": "no"} \ No newline at end of file +{"filename": "IMG_2809.png", "person": "yes", "bicycle": "no", "car": "yes", "motorcycle": "no", "airplane": "no", "bus": "yes", "train": "no", "truck": "no", "boat": "no", "traffic light": "no", "cell phone": "no"} \ No newline at end of file diff --git a/misinformation/test/data/text_IMG_3756.txt b/misinformation/test/data/text_IMG_3756.txt index 49690b1..0e2c9bf 100644 --- a/misinformation/test/data/text_IMG_3756.txt +++ b/misinformation/test/data/text_IMG_3756.txt @@ -3,10 +3,10 @@ The Quantum Theory of Nonrelativistic Collisions JOHN R. 
TAYLOR University of Colorado -ostaliga Lanbidean +postaldia Lanbidean 1 ilde ballenger stor goin -gdĐOL, SIVI 23 TL 02 +gd OOL, STVÍ 23 TL 02 de in obl och yd badalang a diff --git a/misinformation/test/data/text_translated_IMG_3756.txt b/misinformation/test/data/text_translated_IMG_3756.txt index 04479ee..0e2c9bf 100644 --- a/misinformation/test/data/text_translated_IMG_3756.txt +++ b/misinformation/test/data/text_translated_IMG_3756.txt @@ -3,12 +3,12 @@ The Quantum Theory of Nonrelativistic Collisions JOHN R. TAYLOR University of Colorado -ostaliga Lanbidean +postaldia Lanbidean 1 ilde -balloons big goin -gdĐOL, SIVI 23 TL -there in obl -och yd change +ballenger stor goin +gd OOL, STVÍ 23 TL 02 +de in obl +och yd badalang a Ber -ook Sy-RW isn't going anywhere \ No newline at end of file +ook Sy-RW enot go baldus \ No newline at end of file diff --git a/misinformation/test/test_display.py b/misinformation/test/test_display.py index ea634f9..fde8a29 100644 --- a/misinformation/test/test_display.py +++ b/misinformation/test/test_display.py @@ -1,11 +1,14 @@ import json -from misinformation.display import explore_analysis -from pytest import approx + +# import misinformation.display as misinf_display +import pytest + +misinf_display = pytest.importorskip("misinformation.display") def test_explore_analysis_faces(): mydict = {"IMG_2746": {"filename": "./test/data/IMG_2746.png"}} - explore_analysis(mydict, identify="faces") + misinf_display.explore_analysis(mydict, identify="faces") with open("./test/data/example_faces.json", "r") as file: outs = json.load(file) @@ -17,7 +20,7 @@ def test_explore_analysis_faces(): def test_explore_analysis_objects(): mydict = {"IMG_2746": {"filename": "./test/data/IMG_2809.png"}} - explore_analysis(mydict, identify="objects") + misinf_display.explore_analysis(mydict, identify="objects") with open("./test/data/example_analysis_objects.json", "r") as file: outs = json.load(file) diff --git a/misinformation/test/test_multimodal_search.py b/misinformation/test/test_multimodal_search.py new file mode 100644 index 0000000..badfc55 --- /dev/null +++ b/misinformation/test/test_multimodal_search.py @@ -0,0 +1,605 @@ +import pytest +import math +from PIL import Image +import numpy +from torch import device, cuda +import misinformation.multimodal_search as ms + +testdict = { + "d755771b-225e-432f-802e-fb8dc850fff7": { + "filename": "./test/data/d755771b-225e-432f-802e-fb8dc850fff7.png" + }, + "IMG_2746": {"filename": "./test/data/IMG_2746.png"}, + "IMG_2750": {"filename": "./test/data/IMG_2750.png"}, + "IMG_2805": {"filename": "./test/data/IMG_2805.png"}, + "IMG_2806": {"filename": "./test/data/IMG_2806.png"}, + "IMG_2807": {"filename": "./test/data/IMG_2807.png"}, + "IMG_2808": {"filename": "./test/data/IMG_2808.png"}, + "IMG_2809": {"filename": "./test/data/IMG_2809.png"}, + "IMG_3755": {"filename": "./test/data/IMG_3755.jpg"}, + "IMG_3756": {"filename": "./test/data/IMG_3756.jpg"}, + "IMG_3757": {"filename": "./test/data/IMG_3757.jpg"}, + "pic1": {"filename": "./test/data/pic1.png"}, +} + +related_error = 1e-3 +gpu_is_not_available = not cuda.is_available() + + +cuda.empty_cache() + + +def test_read_img(): + my_dict = {} + test_img = ms.MultimodalSearch.read_img(my_dict, testdict["IMG_2746"]["filename"]) + assert list(numpy.array(test_img)[257][34]) == [70, 66, 63] + + +pre_proc_pic_blip2_blip_albef = [ + -1.0039474964141846, + -1.0039474964141846, + -0.8433647751808167, + -0.6097899675369263, + -0.5951915383338928, + -0.6243883967399597, + -0.6827820539474487, + 
-0.6097899675369263, + -0.7119789123535156, + -1.0623412132263184, +] +pre_proc_pic_clip_vitl14 = [ + -0.7995694875717163, + -0.7849710583686829, + -0.7849710583686829, + -0.7703726291656494, + -0.7703726291656494, + -0.7849710583686829, + -0.7849710583686829, + -0.7703726291656494, + -0.7703726291656494, + -0.7703726291656494, +] + +pre_proc_pic_clip_vitl14_336 = [ + -0.7995694875717163, + -0.7849710583686829, + -0.7849710583686829, + -0.7849710583686829, + -0.7849710583686829, + -0.7849710583686829, + -0.7849710583686829, + -0.9163569211959839, + -1.149931788444519, + -1.0039474964141846, +] + +pre_proc_text_blip2_blip_albef = ( + "the bird sat on a tree located at the intersection of 23rd and 43rd streets" +) + +pre_proc_text_clip_clip_vitl14_clip_vitl14_336 = ( + "The bird sat on a tree located at the intersection of 23rd and 43rd streets." +) + +pre_extracted_feature_img_blip2 = [ + 0.04566730558872223, + -0.042554520070552826, + -0.06970272958278656, + -0.009771779179573059, + 0.01446065679192543, + 0.10173682868480682, + 0.007092420011758804, + -0.020045937970280647, + 0.12923966348171234, + 0.006452132016420364, +] + +pre_extracted_feature_img_blip = [ + -0.02480311505496502, + 0.05037587881088257, + 0.039517853409051895, + -0.06994109600782394, + -0.12886561453342438, + 0.047039758414030075, + -0.11620642244815826, + -0.003398326924070716, + -0.07324369996786118, + 0.06994668394327164, +] + +pre_extracted_feature_img_albef = [ + 0.08971136063337326, + -0.10915573686361313, + -0.020636577159166336, + 0.048121627420186996, + -0.05943416804075241, + -0.129856139421463, + -0.0034469354432076216, + 0.017888527363538742, + -0.03284582123160362, + -0.1037328764796257, +] + +pre_extracted_feature_img_clip = [ + 0.01621132344007492, + -0.004035486374050379, + -0.04304071143269539, + -0.03459808602929115, + 0.016922621056437492, + -0.025056276470422745, + -0.04178355261683464, + 0.02165347896516323, + -0.003224249929189682, + 0.020485712215304375, +] + +pre_extracted_feature_img_parsing_clip = [ + 0.01621132344007492, + -0.004035486374050379, + -0.04304071143269539, + -0.03459808602929115, + 0.016922621056437492, + -0.025056276470422745, + -0.04178355261683464, + 0.02165347896516323, + -0.003224249929189682, + 0.020485712215304375, +] + +pre_extracted_feature_img_clip_vitl14 = [ + -0.023943455889821053, + -0.021703708916902542, + 0.035043686628341675, + 0.019495919346809387, + 0.014351222664117813, + -0.008634116500616074, + 0.01610446907579899, + -0.003426523646339774, + 0.011931191198527813, + 0.0008691544644534588, +] + +pre_extracted_feature_img_clip_vitl14_336 = [ + -0.009511193260550499, + -0.012618942186236382, + 0.034754861146211624, + 0.016356879845261574, + -0.0011549904011189938, + -0.008054453879594803, + 0.0011990377679467201, + -0.010806051082909107, + 0.00140204350464046, + 0.0006861367146484554, +] + +pre_extracted_feature_text_blip2 = [ + -0.1384204626083374, + -0.008662976324558258, + 0.006269007455557585, + 0.03151319921016693, + 0.060558050870895386, + -0.03230040520429611, + 0.015861615538597107, + -0.11856459826231003, + -0.058296192437410355, + 0.03699290752410889, +] + +pre_extracted_feature_text_blip = [ + 0.0118643119931221, + -0.01291718054562807, + -0.0009687161073088646, + 0.01428765058517456, + -0.05591396614909172, + 0.07386433333158493, + -0.11475936323404312, + 0.01620068959891796, + 0.0062415082938969135, + 0.0034833091776818037, +] + +pre_extracted_feature_text_albef = [ + -0.06229640915989876, + 0.11278597265481949, + 0.06628583371639252, + 
0.1649140566587448, + 0.068987175822258, + 0.006291372701525688, + 0.03244050219655037, + -0.049556829035282135, + 0.050752390176057816, + -0.0421440489590168, +] + +pre_extracted_feature_text_clip = [ + 0.018169036135077477, + 0.03634127229452133, + 0.025660742074251175, + 0.009149895049631596, + -0.035570453852415085, + 0.033126577734947205, + -0.004808237310498953, + -0.0031453112605959177, + -0.02194291725754738, + 0.024019461125135422, +] + +pre_extracted_feature_text_clip_vitl14 = [ + -0.0055463071912527084, + 0.006908962037414312, + -0.019450219348073006, + -0.018097277730703354, + 0.017567576840519905, + -0.03828490898013115, + -0.03781530633568764, + -0.023951737210154533, + 0.01365653332322836, + -0.02341713197529316, +] + +pre_extracted_feature_text_clip_vitl14_336 = [ + -0.008720514364540577, + 0.005284308455884457, + -0.021116750314831734, + -0.018112430348992348, + 0.01685470901429653, + -0.03517491742968559, + -0.038612402975559235, + -0.021867064759135246, + 0.01685977540910244, + -0.023832324892282486, +] + +simularity_blip2 = [ + [0.05826476216316223, -0.03215287625789642], + [0.12869958579540253, 0.005234059877693653], + [0.11073512583971024, 0.12327003479003906], + [0.08743024617433548, 0.05598106235265732], + [0.04591086134314537, 0.48981112241744995], + [0.06297147274017334, 0.4728018641471863], + [0.18486255407333374, 0.635167121887207], + [0.015356295742094517, 0.015282897278666496], + [-0.008485622704029083, 0.010882291942834854], + [-0.04328630864620209, -0.13117870688438416], + [-0.025470387190580368, 0.13175423443317413], + [-0.05090826004743576, 0.05902523919939995], +] + +sorted_blip2 = [ + [6, 1, 2, 3, 5, 0, 4, 7, 8, 10, 9, 11], + [6, 4, 5, 10, 2, 11, 3, 7, 8, 1, 0, 9], +] + +simularity_blip = [ + [0.15640679001808167, 0.752173662185669], + [0.15139800310134888, 0.7804810404777527], + [0.13010388612747192, 0.755257248878479], + [0.13746635615825653, 0.7618774175643921], + [0.1756758838891983, 0.8531903624534607], + [0.17233705520629883, 0.8448910117149353], + [0.1970970332622528, 0.8916105628013611], + [0.11693969368934631, 0.5833531618118286], + [0.12386563420295715, 0.5981853604316711], + [0.08427951484918594, 0.4962371587753296], + [0.14193706214427948, 0.7613846659660339], + [0.12051936239004135, 0.6492202281951904], +] + +sorted_blip = [ + [6, 4, 5, 0, 1, 10, 3, 2, 8, 11, 7, 9], + [6, 4, 5, 1, 3, 10, 2, 0, 11, 8, 7, 9], +] + +simularity_albef = [ + [0.12321824580430984, 0.35511350631713867], + [0.09512615948915482, 0.27168408036231995], + [0.09053325653076172, 0.20215675234794617], + [0.06335515528917313, 0.15055638551712036], + [0.09604836255311966, 0.4658776521682739], + [0.10870333760976791, 0.5143978595733643], + [0.11748822033405304, 0.6542638540267944], + [0.05688793584704399, 0.22170542180538177], + [0.05597608536481857, 0.11963296681642532], + [0.059643782675266266, 0.14969395101070404], + [0.06690303236246109, 0.3149859607219696], + [0.07909377664327621, 0.11911341547966003], +] + +sorted_albef = [ + [0, 6, 5, 4, 1, 2, 11, 10, 3, 9, 7, 8], + [6, 5, 4, 0, 10, 1, 7, 2, 3, 9, 8, 11], +] + +simularity_clip = [ + [0.23923014104366302, 0.5325412750244141], + [0.20101115107536316, 0.5112978219985962], + [0.17522737383842468, 0.49811851978302], + [0.20062290132045746, 0.5415266156196594], + [0.22865726053714752, 0.5762109756469727], + [0.2310466319322586, 0.5910375714302063], + [0.2644523084163666, 0.7851459383964539], + [0.21474510431289673, 0.4135811924934387], + [0.16407863795757294, 0.1474374681711197], + [0.19819433987140656, 0.26493316888809204], 
+ [0.19545596837997437, 0.5007457137107849], + [0.1647854745388031, 0.45705708861351013], +] + +sorted_clip = [ + [6, 0, 5, 4, 7, 1, 3, 9, 10, 2, 11, 8], + [6, 5, 4, 3, 0, 1, 10, 2, 11, 7, 9, 8], +] + +simularity_clip_vitl14 = [ + [0.1051270067691803, 0.5184808373451233], + [0.09705893695354462, 0.49574509263038635], + [0.11964304000139236, 0.5424358248710632], + [0.13881900906562805, 0.5909714698791504], + [0.12728188931941986, 0.6758255362510681], + [0.1277746558189392, 0.6841973662376404], + [0.18026694655418396, 0.803142786026001], + [0.13977059721946716, 0.45957139134407043], + [0.11180847883224487, 0.24822194874286652], + [0.12296056002378464, 0.35143694281578064], + [0.11596094071865082, 0.5704031586647034], + [0.10174489766359329, 0.44422751665115356], +] + +sorted_clip_vitl14 = [ + [6, 7, 3, 5, 4, 9, 2, 10, 8, 0, 11, 1], + [6, 5, 4, 3, 10, 2, 0, 1, 7, 11, 9, 8], +] + +simularity_clip_vitl14_336 = [ + [0.09391091763973236, 0.49337542057037354], + [0.11103834211826324, 0.4881117343902588], + [0.12891019880771637, 0.5501476526260376], + [0.13288410007953644, 0.5498673915863037], + [0.12357455492019653, 0.6749162077903748], + [0.13700757920742035, 0.7003108263015747], + [0.1788637489080429, 0.7713702321052551], + [0.13260436058044434, 0.4300197660923004], + [0.11666625738143921, 0.2334875613451004], + [0.1316065937280655, 0.3291645646095276], + [0.12374477833509445, 0.5632147192955017], + [0.10333051532506943, 0.43023794889450073], +] + +sorted_clip_vitl14_336 = [ + [6, 5, 3, 7, 9, 2, 10, 4, 8, 1, 11, 0], + [6, 5, 4, 10, 2, 3, 0, 1, 11, 7, 9, 8], +] + + +@pytest.mark.parametrize( + ( + "pre_multimodal_device", + "pre_model", + "pre_proc_pic", + "pre_proc_text", + "pre_extracted_feature_img", + "pre_extracted_feature_text", + "pre_simularity", + "pre_sorted", + ), + [ + pytest.param( + device("cuda"), + "blip2", + pre_proc_pic_blip2_blip_albef, + pre_proc_text_blip2_blip_albef, + pre_extracted_feature_img_blip2, + pre_extracted_feature_text_blip2, + simularity_blip2, + sorted_blip2, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ( + device("cpu"), + "blip", + pre_proc_pic_blip2_blip_albef, + pre_proc_text_blip2_blip_albef, + pre_extracted_feature_img_blip, + pre_extracted_feature_text_blip, + simularity_blip, + sorted_blip, + ), + pytest.param( + device("cuda"), + "blip", + pre_proc_pic_blip2_blip_albef, + pre_proc_text_blip2_blip_albef, + pre_extracted_feature_img_blip, + pre_extracted_feature_text_blip, + simularity_blip, + sorted_blip, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ( + device("cpu"), + "albef", + pre_proc_pic_blip2_blip_albef, + pre_proc_text_blip2_blip_albef, + pre_extracted_feature_img_albef, + pre_extracted_feature_text_albef, + simularity_albef, + sorted_albef, + ), + pytest.param( + device("cuda"), + "albef", + pre_proc_pic_blip2_blip_albef, + pre_proc_text_blip2_blip_albef, + pre_extracted_feature_img_albef, + pre_extracted_feature_text_albef, + simularity_albef, + sorted_albef, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ( + device("cpu"), + "clip_base", + pre_proc_pic_clip_vitl14, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip, + pre_extracted_feature_text_clip, + simularity_clip, + sorted_clip, + ), + pytest.param( + device("cuda"), + "clip_base", + pre_proc_pic_clip_vitl14, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip, + 
pre_extracted_feature_text_clip, + simularity_clip, + sorted_clip, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ( + device("cpu"), + "clip_vitl14", + pre_proc_pic_clip_vitl14, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip_vitl14, + pre_extracted_feature_text_clip_vitl14, + simularity_clip_vitl14, + sorted_clip_vitl14, + ), + pytest.param( + device("cuda"), + "clip_vitl14", + pre_proc_pic_clip_vitl14, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip_vitl14, + pre_extracted_feature_text_clip_vitl14, + simularity_clip_vitl14, + sorted_clip_vitl14, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ( + device("cpu"), + "clip_vitl14_336", + pre_proc_pic_clip_vitl14_336, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip_vitl14_336, + pre_extracted_feature_text_clip_vitl14_336, + simularity_clip_vitl14_336, + sorted_clip_vitl14_336, + ), + pytest.param( + device("cuda"), + "clip_vitl14_336", + pre_proc_pic_clip_vitl14_336, + pre_proc_text_clip_clip_vitl14_clip_vitl14_336, + pre_extracted_feature_img_clip_vitl14_336, + pre_extracted_feature_text_clip_vitl14_336, + simularity_clip_vitl14_336, + sorted_clip_vitl14_336, + marks=pytest.mark.skipif( + gpu_is_not_available, reason="gpu_is_not_availible" + ), + ), + ], +) +def test_parsing_images( + pre_multimodal_device, + pre_model, + pre_proc_pic, + pre_proc_text, + pre_extracted_feature_img, + pre_extracted_feature_text, + pre_simularity, + pre_sorted, +): + + ms.MultimodalSearch.multimodal_device = pre_multimodal_device + ( + model, + vis_processor, + txt_processor, + image_keys, + image_names, + features_image_stacked, + ) = ms.MultimodalSearch.parsing_images(testdict, pre_model) + + for i, num in zip(range(10), features_image_stacked[0, 10:20].tolist()): + assert ( + math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error) + is True + ) + + test_pic = Image.open(testdict["IMG_2746"]["filename"]).convert("RGB") + test_querry = ( + "The bird sat on a tree located at the intersection of 23rd and 43rd streets." 
+ ) + processed_pic = ( + vis_processor["eval"](test_pic).unsqueeze(0).to(pre_multimodal_device) + ) + processed_text = txt_processor["eval"](test_querry) + + for i, num in zip(range(10), processed_pic[0, 0, 0, 25:35].tolist()): + assert math.isclose(num, pre_proc_pic[i], rel_tol=related_error) is True + + assert processed_text == pre_proc_text + + search_query = [ + {"text_input": test_querry}, + {"image": testdict["IMG_2746"]["filename"]}, + ] + multi_features_stacked = ms.MultimodalSearch.querys_processing( + testdict, search_query, model, txt_processor, vis_processor, pre_model + ) + + for i, num in zip(range(10), multi_features_stacked[0, 10:20].tolist()): + assert ( + math.isclose(num, pre_extracted_feature_text[i], rel_tol=related_error) + is True + ) + + for i, num in zip(range(10), multi_features_stacked[1, 10:20].tolist()): + assert ( + math.isclose(num, pre_extracted_feature_img[i], rel_tol=related_error) + is True + ) + + search_query2 = [ + {"text_input": "A bus"}, + {"image": "../misinformation/test/data/IMG_3758.png"}, + ] + + similarity, sorted_list = ms.MultimodalSearch.multimodal_search( + testdict, + model, + vis_processor, + txt_processor, + pre_model, + image_keys, + features_image_stacked, + search_query2, + ) + + for i, num in zip(range(12), similarity.tolist()): + for j, num2 in zip(range(len(num)), num): + assert ( + math.isclose(num2, pre_simularity[i][j], rel_tol=100 * related_error) + is True + ) + + for i, num in zip(range(2), sorted_list): + for j, num2 in zip(range(2), num): + assert num2 == pre_sorted[i][j] + + del model, vis_processor, txt_processor + cuda.empty_cache() diff --git a/misinformation/test/test_objects.py b/misinformation/test/test_objects.py index c73312a..21029ac 100644 --- a/misinformation/test/test_objects.py +++ b/misinformation/test/test_objects.py @@ -6,8 +6,8 @@ import misinformation.objects_cvlib as ob_cvlib OBJECT_1 = "cell phone" OBJECT_2 = "motorcycle" OBJECT_3 = "traffic light" -TEST_IMAGE_1 = "./test/data/IMG_2809.png" -JSON_1 = "./test/data/example_objects_cvlib.json" +TEST_IMAGE_1 = "IMG_2809.png" +JSON_1 = "example_objects_cvlib.json" @pytest.fixture() @@ -25,11 +25,11 @@ def test_objects_from_cvlib(default_objects): assert str(objects) == str(out_objects) -def test_analyse_image_cvlib(): - mydict = {"filename": TEST_IMAGE_1} +def test_analyse_image_cvlib(get_path): + mydict = {"filename": get_path + TEST_IMAGE_1} ob_cvlib.ObjectCVLib().analyse_image(mydict) - with open(JSON_1, "r") as file: + with open(get_path + JSON_1, "r") as file: out_dict = json.load(file) for key in mydict.keys(): assert mydict[key] == out_dict[key] @@ -54,37 +54,37 @@ def test_init_default_objects(): assert init_objects[obj] == "no" -def test_analyse_image_from_file_cvlib(): - file_path = TEST_IMAGE_1 - objs = ob_cvlib.ObjectCVLib().analyse_image_from_file(file_path) +def test_analyse_image_from_file_cvlib(get_path): + file_path = get_path + TEST_IMAGE_1 + objs = ob_cvlib.ObjectCVLib().analyse_image_from_file(get_path + file_path) - with open(JSON_1, "r") as file: + with open(get_path + JSON_1, "r") as file: out_dict = json.load(file) for key in objs.keys(): assert objs[key] == out_dict[key] -def test_detect_objects_cvlib(): - file_path = TEST_IMAGE_1 +def test_detect_objects_cvlib(get_path): + file_path = get_path + TEST_IMAGE_1 objs = ob_cvlib.ObjectCVLib().detect_objects_cvlib(file_path) - with open(JSON_1, "r") as file: + with open(get_path + JSON_1, "r") as file: out_dict = json.load(file) for key in objs.keys(): assert objs[key] == out_dict[key] 
-def test_set_keys(default_objects): - mydict = {"filename": TEST_IMAGE_1} +def test_set_keys(default_objects, get_path): + mydict = {"filename": get_path + TEST_IMAGE_1} key_objs = ob.ObjectDetector(mydict).set_keys() assert str(default_objects) == str(key_objs) -def test_analyse_image(): - mydict = {"filename": TEST_IMAGE_1} +def test_analyse_image(get_path): + mydict = {"filename": get_path + TEST_IMAGE_1} ob.ObjectDetector.set_client_to_cvlib() ob.ObjectDetector(mydict).analyse_image() - with open(JSON_1, "r") as file: + with open(get_path + JSON_1, "r") as file: out_dict = json.load(file) assert str(mydict) == str(out_dict) diff --git a/misinformation/test/test_summary.py b/misinformation/test/test_summary.py new file mode 100644 index 0000000..b92ce59 --- /dev/null +++ b/misinformation/test/test_summary.py @@ -0,0 +1,166 @@ +import os +from torch import device, cuda +from lavis.models import load_model_and_preprocess +import misinformation.summary as sm + +images = [ + "./test/data/d755771b-225e-432f-802e-fb8dc850fff7.png", + "./test/data/IMG_2746.png", + "./test/data/IMG_2750.png", + "./test/data/IMG_2805.png", + "./test/data/IMG_2806.png", + "./test/data/IMG_2807.png", + "./test/data/IMG_2808.png", + "./test/data/IMG_2809.png", + "./test/data/IMG_3755.jpg", + "./test/data/IMG_3756.jpg", + "./test/data/IMG_3757.jpg", + "./test/data/pic1.png", +] + + +def test_analyse_image(): + mydict = {} + for img_path in images: + id_ = os.path.splitext(os.path.basename(img_path))[0] + mydict[id_] = {"filename": img_path} + + for key in mydict: + mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image() + keys = list(mydict.keys()) + assert len(mydict) == 12 + for key in keys: + assert len(mydict[key]["3_non-deterministic summary"]) == 3 + + const_image_summary_list = [ + "a river running through a city next to tall buildings", + "a crowd of people standing on top of a tennis court", + "a crowd of people standing on top of a field", + "a room with a desk and a chair", + "a table with plastic containers on top of it", + "a view of a city with mountains in the background", + "a view of a city street from a window", + "a busy city street with cars and pedestrians", + "a close up of an open book with writing on it", + "a book that is open on a table", + "a yellow book with green lettering on it", + "a person running on a beach near a rock formation", + ] + + for i in range(len(const_image_summary_list)): + assert mydict[keys[i]]["const_image_summary"] == const_image_summary_list[i] + + del sm.SummaryDetector.summary_model, sm.SummaryDetector.summary_vis_processors + cuda.empty_cache() + + summary_device = device("cuda" if cuda.is_available() else "cpu") + summary_model, summary_vis_processors, _ = load_model_and_preprocess( + name="blip_caption", + model_type="base_coco", + is_eval=True, + device=summary_device, + ) + + for key in mydict: + mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image( + summary_model, summary_vis_processors + ) + keys = list(mydict.keys()) + + assert len(mydict) == 12 + for key in keys: + assert len(mydict[key]["3_non-deterministic summary"]) == 3 + + const_image_summary_list2 = [ + "a river running through a city next to tall buildings", + "a crowd of people standing on top of a tennis court", + "a crowd of people standing on top of a field", + "a room with a desk and a chair", + "a table with plastic containers on top of it", + "a view of a city with mountains in the background", + "a view of a city street from a window", + "a busy city street with cars and 
pedestrians", + "a close up of an open book with writing on it", + "a book that is open on a table", + "a yellow book with green lettering on it", + "a person running on a beach near a rock formation", + ] + + for i in range(len(const_image_summary_list2)): + assert mydict[keys[i]]["const_image_summary"] == const_image_summary_list2[i] + + del summary_model, summary_vis_processors + cuda.empty_cache() + + summary_model, summary_vis_processors, _ = load_model_and_preprocess( + name="blip_caption", + model_type="large_coco", + is_eval=True, + device=summary_device, + ) + + for key in mydict: + mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image( + summary_model, summary_vis_processors + ) + keys = list(mydict.keys()) + assert len(mydict) == 12 + for key in keys: + assert len(mydict[key]["3_non-deterministic summary"]) == 3 + + const_image_summary_list3 = [ + "a river running through a town next to tall buildings", + "a crowd of people standing on top of a track", + "a group of people standing on top of a track", + "a desk and chair in a small room", + "a table that has some chairs on top of it", + "a view of a city from a window of a building", + "a view of a city from a window", + "a city street filled with lots of traffic", + "an open book with german text on it", + "a close up of a book on a table", + "a book with a green cover on a table", + "a person running on a beach near the ocean", + ] + + for i in range(len(const_image_summary_list2)): + assert mydict[keys[i]]["const_image_summary"] == const_image_summary_list3[i] + + +def test_analyse_questions(): + mydict = {} + for img_path in images: + id_ = os.path.splitext(os.path.basename(img_path))[0] + mydict[id_] = {"filename": img_path} + + list_of_questions = [ + "How many persons on the picture?", + "What happends on the picture?", + ] + for key in mydict: + mydict[key] = sm.SummaryDetector(mydict[key]).analyse_questions( + list_of_questions + ) + + keys = list(mydict.keys()) + assert len(mydict) == 12 + + list_of_questions_ans = [2, 100, "many", 0, 0, "none", "two", 5, 0, 0, 0, 1] + + list_of_questions_ans2 = [ + "flood", + "festival", + "people are flying kites", + "no one's home", + "chair is being moved", + "traffic jam", + "day time", + "traffic jam", + "nothing", + "nothing", + "nothing", + "running", + ] + + for i in range(len(list_of_questions_ans)): + assert mydict[keys[i]][list_of_questions[1]] == str(list_of_questions_ans2[i]) diff --git a/misinformation/test/test_text.py b/misinformation/test/test_text.py index 9e71349..b3c8675 100644 --- a/misinformation/test/test_text.py +++ b/misinformation/test/test_text.py @@ -2,31 +2,30 @@ import os import pytest import spacy import misinformation.text as tt -import misinformation -import pandas as pd - -TESTDICT = { - "IMG_3755": { - "filename": "./test/data/IMG_3755.jpg", - }, - "IMG_3756": { - "filename": "./test/data/IMG_3756.jpg", - }, - "IMG_3757": { - "filename": "./test/data/IMG_3757.jpg", - }, -} - -LANGUAGES = ["de", "om", "en"] - -os.environ[ - "GOOGLE_APPLICATION_CREDENTIALS" -] = "../data/seismic-bonfire-329406-412821a70264.json" -def test_TextDetector(): - for item in TESTDICT: - test_obj = tt.TextDetector(TESTDICT[item]) +@pytest.fixture +def set_testdict(get_path): + testdict = { + "IMG_3755": { + "filename": get_path + "IMG_3755.jpg", + }, + "IMG_3756": { + "filename": get_path + "IMG_3756.jpg", + }, + "IMG_3757": { + "filename": get_path + "IMG_3757.jpg", + }, + } + return testdict + + +LANGUAGES = ["de", "en", "en"] + + +def test_TextDetector(set_testdict): 
+ for item in set_testdict: + test_obj = tt.TextDetector(set_testdict[item]) assert test_obj.subdict["text"] is None assert test_obj.subdict["text_language"] is None assert test_obj.subdict["text_english"] is None @@ -34,30 +33,30 @@ def test_TextDetector(): @pytest.mark.gcv -def test_analyse_image(): - for item in TESTDICT: - test_obj = tt.TextDetector(TESTDICT[item]) +def test_analyse_image(set_testdict, set_environ): + for item in set_testdict: + test_obj = tt.TextDetector(set_testdict[item]) test_obj.analyse_image() - test_obj = tt.TextDetector(TESTDICT[item], analyse_text=True) + test_obj = tt.TextDetector(set_testdict[item], analyse_text=True) test_obj.analyse_image() @pytest.mark.gcv -def test_get_text_from_image(): - for item in TESTDICT: - test_obj = tt.TextDetector(TESTDICT[item]) +def test_get_text_from_image(set_testdict, get_path, set_environ): + for item in set_testdict: + test_obj = tt.TextDetector(set_testdict[item]) test_obj.get_text_from_image() - ref_file = "./test/data/text_" + item + ".txt" + ref_file = get_path + "text_" + item + ".txt" with open(ref_file, "r", encoding="utf8") as file: reference_text = file.read() assert test_obj.subdict["text"] == reference_text -def test_translate_text(): - for item, lang in zip(TESTDICT, LANGUAGES): - test_obj = tt.TextDetector(TESTDICT[item]) - ref_file = "./test/data/text_" + item + ".txt" - trans_file = "./test/data/text_translated_" + item + ".txt" +def test_translate_text(set_testdict, get_path): + for item, lang in zip(set_testdict, LANGUAGES): + test_obj = tt.TextDetector(set_testdict[item]) + ref_file = get_path + "text_" + item + ".txt" + trans_file = get_path + "text_translated_" + item + ".txt" with open(ref_file, "r", encoding="utf8") as file: reference_text = file.read() with open(trans_file, "r", encoding="utf8") as file: @@ -77,9 +76,9 @@ def test_remove_linebreaks(): assert test_obj.subdict["text_english"] == "This is another test." -def test_run_spacy(): - test_obj = tt.TextDetector(TESTDICT["IMG_3755"], analyse_text=True) - ref_file = "./test/data/text_IMG_3755.txt" +def test_run_spacy(set_testdict, get_path): + test_obj = tt.TextDetector(set_testdict["IMG_3755"], analyse_text=True) + ref_file = get_path + "text_IMG_3755.txt" with open(ref_file, "r") as file: reference_text = file.read() test_obj.subdict["text_english"] = reference_text @@ -87,10 +86,10 @@ def test_run_spacy(): assert isinstance(test_obj.doc, spacy.tokens.doc.Doc) -def test_clean_text(): +def test_clean_text(set_testdict): nlp = spacy.load("en_core_web_md") doc = nlp("I like cats and fjejg") - test_obj = tt.TextDetector(TESTDICT["IMG_3755"]) + test_obj = tt.TextDetector(set_testdict["IMG_3755"]) test_obj.doc = doc test_obj.clean_text() result = "I like cats and" @@ -117,30 +116,35 @@ def test_sentiment_analysis(): assert test_obj.subdict["subjectivity"] == 0.6 -def test_PostprocessText(): +def test_PostprocessText(set_testdict, get_path): reference_dict = "THE\nALGEBRAIC\nEIGENVALUE\nPROBLEM\nDOM\nNVS TIO\nMINA\nMonographs\non Numerical Analysis\nJ.. H. 
WILKINSON" reference_df = "Mathematische Formelsammlung\nfür Ingenieure und Naturwissenschaftler\nMit zahlreichen Abbildungen und Rechenbeispielen\nund einer ausführlichen Integraltafel\n3., verbesserte Auflage" - obj = tt.PostprocessText(mydict=TESTDICT) - # make sure test works on windows where end-of-line character is \r\n + img_numbers = ["IMG_3755", "IMG_3756", "IMG_3757"] + for image_ref in img_numbers: + ref_file = get_path + "text_" + image_ref + ".txt" + with open(ref_file, "r") as file: + reference_text = file.read() + set_testdict[image_ref]["text_english"] = reference_text + obj = tt.PostprocessText(mydict=set_testdict) test_dict = obj.list_text_english[2].replace("\r", "") assert test_dict == reference_dict - for key in TESTDICT.keys(): - TESTDICT[key].pop("text_english") + for key in set_testdict.keys(): + set_testdict[key].pop("text_english") with pytest.raises(ValueError): - tt.PostprocessText(mydict=TESTDICT) - obj = tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out.csv") + tt.PostprocessText(mydict=set_testdict) + obj = tt.PostprocessText(use_csv=True, csv_path=get_path + "test_data_out.csv") # make sure test works on windows where end-of-line character is \r\n test_df = obj.list_text_english[0].replace("\r", "") assert test_df == reference_df with pytest.raises(ValueError): - tt.PostprocessText(use_csv=True, csv_path="./test/data/test_data_out_nokey.csv") + tt.PostprocessText(use_csv=True, csv_path=get_path + "test_data_out_nokey.csv") with pytest.raises(ValueError): tt.PostprocessText() -def test_analyse_topic(): +def test_analyse_topic(get_path): _, topic_df, most_frequent_topics = tt.PostprocessText( - use_csv=True, csv_path="./test/data/topic_analysis_test.csv" + use_csv=True, csv_path=get_path + "topic_analysis_test.csv" ).analyse_topic() # since this is not deterministic we cannot be sure we get the same result twice assert len(topic_df) == 2 diff --git a/misinformation/test/test_utils.py b/misinformation/test/test_utils.py index 9479ed5..9529a54 100644 --- a/misinformation/test/test_utils.py +++ b/misinformation/test/test_utils.py @@ -3,38 +3,36 @@ import pandas as pd import misinformation.utils as ut -def test_find_files(): - result = ut.find_files( - path="./test/data/", pattern="*.png", recursive=True, limit=10 - ) +def test_find_files(get_path): + result = ut.find_files(path=get_path, pattern="*.png", recursive=True, limit=10) assert len(result) > 0 -def test_initialize_dict(): +def test_initialize_dict(get_path): result = [ "./test/data/image_faces.jpg", "./test/data/image_objects.jpg", ] mydict = ut.initialize_dict(result) - with open("./test/data/example_utils_init_dict.json", "r") as file: + with open(get_path + "example_utils_init_dict.json", "r") as file: out_dict = json.load(file) assert mydict == out_dict -def test_append_data_to_dict(): - with open("./test/data/example_append_data_to_dict_in.json", "r") as file: +def test_append_data_to_dict(get_path): + with open(get_path + "example_append_data_to_dict_in.json", "r") as file: mydict = json.load(file) outdict = ut.append_data_to_dict(mydict) print(outdict) - with open("./test/data/example_append_data_to_dict_out.json", "r") as file: + with open(get_path + "example_append_data_to_dict_out.json", "r") as file: example_outdict = json.load(file) assert outdict == example_outdict -def test_dump_df(): - with open("./test/data/example_append_data_to_dict_out.json", "r") as file: +def test_dump_df(get_path): + with open(get_path + "example_append_data_to_dict_out.json", "r") as file: 
outdict = json.load(file) df = ut.dump_df(outdict) - out_df = pd.read_csv("./test/data/example_dump_df.csv", index_col=[0]) + out_df = pd.read_csv(get_path + "example_dump_df.csv", index_col=[0]) pd.testing.assert_frame_equal(df, out_df) diff --git a/notebooks/facial_expressions.ipynb b/notebooks/facial_expressions.ipynb index dfd42f6..fe00584 100644 --- a/notebooks/facial_expressions.ipynb +++ b/notebooks/facial_expressions.ipynb @@ -46,7 +46,9 @@ "metadata": {}, "outputs": [], "source": [ - "import misinformation" + "import misinformation\n", + "from misinformation import utils as mutils\n", + "from misinformation import display as mdisplay" ] }, { @@ -64,7 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "images = misinformation.find_files(\n", + "images = mutils.find_files(\n", " path=\"drive/MyDrive/misinformation-data/\",\n", " limit=1000,\n", ")" @@ -85,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "?misinformation.find_files" + "?mutils.find_files" ] }, { @@ -103,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "mydict = misinformation.utils.initialize_dict(images)" + "mydict = mutils.initialize_dict(images[0:4])" ] }, { @@ -122,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "misinformation.explore_analysis(mydict, identify=\"faces\")" + "mdisplay.explore_analysis(mydict, identify=\"faces\")" ] }, { @@ -159,8 +161,8 @@ "metadata": {}, "outputs": [], "source": [ - "outdict = misinformation.utils.append_data_to_dict(mydict)\n", - "df = misinformation.utils.dump_df(outdict)" + "outdict = mutils.append_data_to_dict(mydict)\n", + "df = mutils.dump_df(outdict)" ] }, { @@ -210,7 +212,7 @@ ], "metadata": { "kernelspec": { - "display_name": "misinf", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -224,7 +226,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6 (main, Oct 24 2022, 16:07:47) [GCC 11.2.0]" + "version": "3.9.5" }, "vscode": { "interpreter": { diff --git a/notebooks/get-text-from-image.ipynb b/notebooks/get-text-from-image.ipynb index 7666a88..0542220 100644 --- a/notebooks/get-text-from-image.ipynb +++ b/notebooks/get-text-from-image.ipynb @@ -42,6 +42,8 @@ "import os\n", "from IPython.display import Image, display\n", "import misinformation\n", + "from misinformation import utils as mutils\n", + "from misinformation import display as mdisplay\n", "import tensorflow as tf\n", "\n", "print(tf.config.list_physical_devices(\"GPU\"))" @@ -66,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "images = misinformation.find_files(path=\"../data/all/\", limit=1000)" + "images = mutils.find_files(path=\"../data/all/\", limit=1000)" ] }, { @@ -87,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "mydict = misinformation.utils.initialize_dict(images[0:3])" + "mydict = mutils.initialize_dict(images[0:3])" ] }, { @@ -126,7 +128,7 @@ "metadata": {}, "outputs": [], "source": [ - "misinformation.explore_analysis(mydict, identify=\"text-on-image\")" + "mdisplay.explore_analysis(mydict, identify=\"text-on-image\")" ] }, { @@ -166,8 +168,8 @@ "metadata": {}, "outputs": [], "source": [ - "outdict = misinformation.utils.append_data_to_dict(mydict)\n", - "df = misinformation.utils.dump_df(outdict)" + "outdict = mutils.append_data_to_dict(mydict)\n", + "df = mutils.dump_df(outdict)" ] }, { @@ -347,7 +349,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.5" }, "vscode": { "interpreter": { 
diff --git a/notebooks/image_summary.ipynb b/notebooks/image_summary.ipynb new file mode 100644 index 0000000..d38bceb --- /dev/null +++ b/notebooks/image_summary.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Image summary and visual question answering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook shows some preliminary work on image captioning and visual question answering with lavis. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `misinformation` package that is imported here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import misinformation\n", + "from misinformation import utils as mutils\n", + "from misinformation import display as mdisplay\n", + "import misinformation.summary as sm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the path to the folder containing the images to be analyzed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "images = mutils.find_files(\n", + "    path=\"../misinformation/test/data/\",\n", + "    limit=1000,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mydict = mutils.initialize_dict(images[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mydict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create captions for images and directly write to csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here you can choose between two models: \"base\" or \"large\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_model, summary_vis_processors = sm.SummaryDetector.load_model(mydict, \"base\")\n", + "# summary_model, summary_vis_processors = sm.SummaryDetector.load_model(mydict, \"large\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for key in mydict:\n", + "    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image(\n", + "        summary_model, summary_vis_processors\n", + "    )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "Convert the dictionary of dictionaries into a dictionary with lists:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outdict = mutils.append_data_to_dict(mydict)\n", + "df = mutils.dump_df(outdict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Write the csv file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"./data_out.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manually inspect the summaries\n", + "\n", + "To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. 
If you are sure of what you are doing, you can skip this step.\n", + "\n", + "`const_image_summary` - the permanent summary, which does not change from run to run (analyse_image).\n", + "\n", + "`3_non-deterministic summary` - 3 different summary examples that change from run to run (analyse_image). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mdisplay.explore_analysis(mydict, identify=\"summary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate answers to free-form questions about images written in natural language. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the list of questions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_of_questions = [\n", + "    \"How many persons on the picture?\",\n", + "    \"Are there any politicians in the picture?\",\n", + "    \"Does the picture show something from medicine?\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for key in mydict:\n", + "    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_questions(list_of_questions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mdisplay.explore_analysis(mydict, identify=\"summary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the dictionary of dictionaries into a dictionary with lists:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "outdict2 = mutils.append_data_to_dict(mydict)\n", + "df2 = mutils.dump_df(outdict2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.to_csv(\"./data_out2.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + }, + "vscode": { + "interpreter": { + "hash": "f1142466f556ab37fe2d38e2897a16796906208adb09fea90ba58bdf8a56f0ba" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/multimodal_search.ipynb b/notebooks/multimodal_search.ipynb new file mode 100644 index 0000000..24664a7 --- /dev/null +++ b/notebooks/multimodal_search.ipynb @@ -0,0 +1,336 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "22df2297-0629-45aa-b88c-6c61f1544db6", + "metadata": {}, + "source": [ + "# Image Multimodal Search" + ] + }, + { + "cell_type": "markdown", + "id": "9eeeb302-296e-48dc-86c7-254aa02f2b3a", + "metadata": {}, + "source": [ + "This notebook shows some preliminary work on image multimodal search with the lavis library. It is mainly meant to explore its capabilities and to decide on future research directions. 
We package our code into a `misinformation` package that is imported here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f10ad6c9-b1a0-4043-8c5d-ed660d77be37", + "metadata": {}, + "outputs": [], + "source": [ + "import misinformation\n", + "import misinformation.multimodal_search as ms" + ] + }, + { + "cell_type": "markdown", + "id": "acf08b44-3ea6-44cd-926d-15c0fd9f39e0", + "metadata": {}, + "source": [ + "Set the path to the folder containing the images to be analyzed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d3fe589-ff3c-4575-b8f5-650db85596bc", + "metadata": {}, + "outputs": [], + "source": [ + "images = misinformation.utils.find_files(\n", + "    path=\"../data/images/\",\n", + "    limit=1000,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adf3db21-1f8b-4d44-bbef-ef0acf4623a0", + "metadata": {}, + "outputs": [], + "source": [ + "mydict = misinformation.utils.initialize_dict(images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d98b6227-886d-41b8-a377-896dd8ab3c2a", + "metadata": {}, + "outputs": [], + "source": [ + "mydict" + ] + }, + { + "cell_type": "markdown", + "id": "987540a8-d800-4c70-a76b-7bfabaf123fa", + "metadata": {}, + "source": [ + "## Indexing and extracting features from images in the selected folder" + ] + }, + { + "cell_type": "markdown", + "id": "66d6ede4-00bc-4aeb-9a36-e52d7de33fe5", + "metadata": {}, + "source": [ + "You can choose one of the following models: blip, blip2, albef, clip_base, clip_vitl14, clip_vitl14_336" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bbca1f0-d4b0-43cd-8e05-ee39d37c328e", + "metadata": {}, + "outputs": [], + "source": [ + "model_type = \"blip\"\n", + "# model_type = \"blip2\"\n", + "# model_type = \"albef\"\n", + "# model_type = \"clip_base\"\n", + "# model_type = \"clip_vitl14\"\n", + "# model_type = \"clip_vitl14_336\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca095404-57d0-4f5d-aeb0-38c232252b17", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + "    model,\n", + "    vis_processors,\n", + "    txt_processors,\n", + "    image_keys,\n", + "    image_names,\n", + "    features_image_stacked,\n", + ") = ms.MultimodalSearch.parsing_images(mydict, model_type)" + ] + }, + { + "cell_type": "markdown", + "id": "9ff8a894-566b-4c4f-acca-21c50b5b1f52", + "metadata": {}, + "source": [ + "The tensors of all images `features_image_stacked` were saved in `__saved_features_image.pt`. If you have run this once for the current model and set of images, you do not need to repeat it. Instead, you can load these features with the command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56c6d488-f093-4661-835a-5c73a329c874", + "metadata": {}, + "outputs": [], + "source": [ + "# (\n", + "#     model,\n", + "#     vis_processors,\n", + "#     txt_processors,\n", + "#     image_keys,\n", + "#     image_names,\n", + "#     features_image_stacked,\n", + "# ) = ms.MultimodalSearch.parsing_images(mydict, model_type,\"18_clip_base_saved_features_image.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "309923c1-d6f8-4424-8fca-bde5f3a98b38", + "metadata": {}, + "source": [ + "Here we have already processed our image folder of 18 images with the `clip_base` model, so you just need to pass the name `18_clip_base_saved_features_image.pt` of the saved file containing the tensors of all images as the third argument to the previous function. 
" + ] + }, + { + "cell_type": "markdown", + "id": "162a52e8-6652-4897-b92e-645cab07aaef", + "metadata": {}, + "source": [ + "Next, you need to form search queries. You can search either by image or by text. You can search for a single query, or you can search for several queries at once, the computational time should not be much different. The format of the queries is as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4196a52-d01e-42e4-8674-5712f7d6f792", + "metadata": {}, + "outputs": [], + "source": [ + "search_query3 = [\n", + " {\"text_input\": \"politician press conference\"},\n", + " {\"text_input\": \"a world map\"},\n", + " {\"image\": \"../data/haos.png\"},\n", + " {\"image\": \"../data/image-34098-800.png\"},\n", + " {\"image\": \"../data/LeonPresserMorocco20032015_600.png\"},\n", + " {\"text_input\": \"a dog\"},\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "8bcf3127-3dfd-4ff4-b9e7-a043099b1418", + "metadata": {}, + "source": [ + "You can filter your results in 3 different ways:\n", + "- `filter_number_of_images` limits the number of images found. That is, if the parameter `filter_number_of_images = 10`, then the first 10 images that best match the query will be shown. The other images ranks will be set to `None` and the similarity value to `0`.\n", + "- `filter_val_limit` limits the output of images with a similarity value not bigger than `filter_val_limit`. That is, if the parameter `filter_val_limit = 0.2`, all images with similarity less than 0.2 will be discarded.\n", + "- `filter_rel_error` (percentage) limits the output of images with a similarity value not bigger than `100 * abs(current_simularity_value - best_simularity_value_in_current_search)/best_simularity_value_in_current_search < filter_rel_error`. That is, if we set filter_rel_error = 30, it means that if the top1 image have 0.5 similarity value, we discard all image with similarity less than 0.35." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f7dc52f-7ee9-4590-96b7-e0d9d3b82378", + "metadata": {}, + "outputs": [], + "source": [ + "similarity = ms.MultimodalSearch.multimodal_search(\n", + " mydict,\n", + " model,\n", + " vis_processors,\n", + " txt_processors,\n", + " model_type,\n", + " image_keys,\n", + " features_image_stacked,\n", + " search_query3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e1cf7e46-0c2c-4fb2-b89a-ef585ccb9339", + "metadata": {}, + "source": [ + "After launching `multimodal_search` function, the results of each query will be added to the source dictionary. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad74b21-6187-4a58-9ed8-fd3e80f5a4ed", + "metadata": {}, + "outputs": [], + "source": [ + "mydict[\"100127S_ara\"]" + ] + }, + { + "cell_type": "markdown", + "id": "cd3ee120-8561-482b-a76a-e8f996783325", + "metadata": {}, + "source": [ + "A special function was written to present the search results conveniently. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4324e4fd-e9aa-4933-bb12-074d54e0c510", + "metadata": {}, + "outputs": [], + "source": [ + "ms.MultimodalSearch.show_results(mydict, search_query3[4])" + ] + }, + { + "cell_type": "markdown", + "id": "d86ab96b-1907-4b7f-a78e-3983b516d781", + "metadata": { + "tags": [] + }, + "source": [ + "## Save searhing results to csv" + ] + }, + { + "cell_type": "markdown", + "id": "4bdbc4d4-695d-4751-ab7c-d2d98e2917d7", + "metadata": { + "tags": [] + }, + "source": [ + "Convert the dictionary of dictionarys into a dictionary with lists:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c6ddd83-bc87-48f2-a8d6-1bd3f4201ff7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "outdict = misinformation.utils.append_data_to_dict(mydict)\n", + "df = misinformation.utils.dump_df(outdict)" + ] + }, + { + "cell_type": "markdown", + "id": "ea2675d5-604c-45e7-86d2-080b1f4559a0", + "metadata": { + "tags": [] + }, + "source": [ + "Check the dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e78646d6-80be-4d3e-8123-3360957bcaa8", + "metadata": {}, + "outputs": [], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "05546d99-afab-4565-8f30-f14e1426abcf", + "metadata": {}, + "source": [ + "Write the csv file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "185f7dde-20dc-44d8-9ab0-de41f9b5734d", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"./data_out.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/objects_expression.ipynb b/notebooks/objects_expression.ipynb index 405ee7c..4aa1431 100644 --- a/notebooks/objects_expression.ipynb +++ b/notebooks/objects_expression.ipynb @@ -21,6 +21,8 @@ "outputs": [], "source": [ "import misinformation\n", + "from misinformation import utils as mutils\n", + "from misinformation import display as mdisplay\n", "import misinformation.objects as ob" ] }, @@ -37,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "images = misinformation.find_files(\n", + "images = mutils.find_files(\n", " path=\"../data/images-little-text/\",\n", " limit=1000,\n", ")" @@ -49,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "mydict = misinformation.utils.initialize_dict(images)" + "mydict = mutils.initialize_dict(images)" ] }, { @@ -91,8 +93,8 @@ "metadata": {}, "outputs": [], "source": [ - "outdict = misinformation.utils.append_data_to_dict(mydict)\n", - "df = misinformation.utils.dump_df(outdict)" + "outdict = mutils.append_data_to_dict(mydict)\n", + "df = mutils.dump_df(outdict)" ] }, { @@ -142,7 +144,7 @@ "metadata": {}, "outputs": [], "source": [ - "misinformation.explore_analysis(mydict, identify=\"objects\")" + "mdisplay.explore_analysis(mydict, identify=\"objects\")" ] }, { @@ -213,7 +215,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.0" + "version": "3.9.5" }, "vscode": { "interpreter": { diff --git a/pyproject.toml b/pyproject.toml index 9a3fc1e..a276151 100644 --- a/pyproject.toml +++ b/pyproject.toml 
@@ -12,6 +12,7 @@ readme = "README.md" maintainers = [ { name = "Inga Ulusoy", email = "ssc@iwr.uni-heidelberg.de" }, { name = "Dominic Kempf", email = "ssc@iwr.uni-heidelberg.de" }, + { name = "Petr Andriushchenko", email = "ssc@iwr.uni-heidelberg.de" }, ] requires-python = ">=3.8" license = { text = "MIT" } @@ -21,33 +22,34 @@ classifiers = [ "License :: OSI Approved :: MIT License", ] dependencies = [ - "google-cloud-vision", + "bertopic", "cvlib", - "deepface <= 0.0.75", + "deepface @ git+https://github.com/iulusoy/deepface.git", + "googletrans==3.1.0a0", + "grpcio", + "importlib_metadata", + "ipython", "ipywidgets", + "jupyterlab", + "matplotlib", "numpy<=1.23.4", - "opencv_python", "pandas", + "Pillow", "pooch", "protobuf", - "retina_face", - "setuptools", - "tensorflow", - "keras", - "openpyxl", "pytest", "pytest-cov", - "matplotlib", - "pytest", - "opencv-contrib-python", - "googletrans==3.1.0a0", + "requests", + "retina_face @ git+https://github.com/iulusoy/retinaface.git", + "salesforce-lavis @ git+https://github.com/iulusoy/LAVIS.git", "spacy", - "jupyterlab", "spacytextblob", + "tensorflow", "textblob", - "bertopic", - "grpcio", - "pandas", + "torch", + "google-cloud-vision", + "setuptools", + "opencv-contrib-python", ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 7abae00..6fe56b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,6 @@ jupyterlab spacytextblob textblob git+https://github.com/sloria/TextBlob.git@dev +salesforce-lavis bertopic grpcio -pandas \ No newline at end of file
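The updated tests in `misinformation/test/test_text.py` and `misinformation/test/test_utils.py` take a `get_path` fixture that does not appear in the hunks above; it presumably lives in a shared `conftest.py`. A minimal sketch of such a fixture, assuming it only needs to return the test data directory with a trailing separator (the tests build paths by plain concatenation, e.g. `get_path + "test_data_out.csv"`):

```python
# Hypothetical conftest.py for misinformation/test/ -- not part of this patch,
# only an illustration of the get_path fixture the updated tests expect.
import os

import pytest


@pytest.fixture
def get_path():
    # Return the test data directory with a trailing separator, because the
    # tests concatenate file names onto it directly.
    return os.path.join(os.path.dirname(__file__), "data") + os.sep
```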
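The multimodal_search notebook describes `filter_number_of_images`, `filter_val_limit` and `filter_rel_error`, but the example call does not pass them. A sketch of a filtered search, reusing the objects created in the notebook cells and assuming the three options are plain keyword arguments of `MultimodalSearch.multimodal_search` (the actual signature in `misinformation/multimodal_search.py` may differ):

```python
# Hypothetical filtered call; the keyword names follow the notebook prose and
# may not match the real signature. All other variables come from the
# multimodal_search notebook cells above.
similarity = ms.MultimodalSearch.multimodal_search(
    mydict,
    model,
    vis_processors,
    txt_processors,
    model_type,
    image_keys,
    features_image_stacked,
    search_query3,
    filter_number_of_images=10,  # keep only the 10 best matches per query
    filter_val_limit=0.2,  # drop matches with similarity below 0.2
    filter_rel_error=30,  # drop matches more than 30% below the best similarity,
    # e.g. best match 0.5 -> cutoff 0.5 * (1 - 30 / 100) = 0.35
)
```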
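For reference, the captioning workflow that `notebooks/image_summary.ipynb` spreads over several cells, condensed into a single script; the API calls mirror the notebook cells, while the input folder and output file name are placeholders:

```python
# Condensed sketch of the image_summary notebook workflow; paths are placeholders.
from misinformation import utils as mutils
import misinformation.summary as sm

images = mutils.find_files(path="../misinformation/test/data/", limit=1000)
mydict = mutils.initialize_dict(images[0:10])

# Load the captioning model once ("base" or "large") and reuse it for all images.
summary_model, summary_vis_processors = sm.SummaryDetector.load_model(mydict, "base")

for key in mydict:
    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image(
        summary_model, summary_vis_processors
    )

# Flatten the nested dictionary and write the captions to csv.
outdict = mutils.append_data_to_dict(mydict)
df = mutils.dump_df(outdict)
df.to_csv("./data_out.csv")
```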