Merge pull request #84 from ssciwr/summary_multi_docs

Prepared functions documentation and fixed some code in multimodal search.
This commit is contained in:
Petr Andriushchenko 2023-05-26 11:22:50 +02:00 committed by GitHub
parents 5a8658df96 294e9d106a
commit 5b029b5c8b
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 1270 additions and 316 deletions
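For orientation, the end-to-end usage pattern documented by these changes (taken from the updated notebooks further down in this diff; the folder path, model choice, and query text are illustrative placeholders, not part of the changeset) looks roughly like this:

import ammico.utils as mutils
import ammico.multimodal_search as ms

# collect image files and build the image dictionary (placeholder path)
images = mutils.find_files(path="data/", limit=10)
mydict = mutils.initialize_dict(images)

# the search methods now operate on a MultimodalSearch object wrapping self.subdict
my_obj = ms.MultimodalSearch(mydict)
(
    model,
    vis_processors,
    txt_processors,
    image_keys,
    image_names,
    features_image_stacked,
) = my_obj.parsing_images("clip_base", path_to_save_tensors="data/")

search_query = [{"text_input": "politician press conference"}]
similarity, sorted_lists = my_obj.multimodal_search(
    model,
    vis_processors,
    txt_processors,
    "clip_base",
    image_keys,
    features_image_stacked,
    search_query,
)
my_obj.show_results(search_query[0])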

View file

@ -20,7 +20,18 @@ class MultimodalSearch(AnalysisMethod):
multimodal_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_feature_extractor_model_blip2(self, device):
def load_feature_extractor_model_blip2(self, device: str = "cpu"):
"""
Load pretrained blip2_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="blip2_feature_extractor",
model_type="pretrain",
@ -29,7 +40,18 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
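As a quick, hedged illustration of how any of these loaders is invoked (assuming `my_obj` is a MultimodalSearch instance built from an image dictionary, as in the sketch at the top of this diff):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = my_obj.load_feature_extractor_model_blip2(device=device)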
def load_feature_extractor_model_blip(self, device):
def load_feature_extractor_model_blip(self, device: str = "cpu"):
"""
Load base blip_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="blip_feature_extractor",
model_type="base",
@ -38,7 +60,18 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
def load_feature_extractor_model_albef(self, device):
def load_feature_extractor_model_albef(self, device: str = "cpu"):
"""
Load base albef_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="albef_feature_extractor",
model_type="base",
@ -47,7 +80,18 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
def load_feature_extractor_model_clip_base(self, device):
def load_feature_extractor_model_clip_base(self, device: str = "cpu"):
"""
Load base clip_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="clip_feature_extractor",
model_type="base",
@ -56,7 +100,18 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
def load_feature_extractor_model_clip_vitl14(self, device):
def load_feature_extractor_model_clip_vitl14(self, device: str = "cpu"):
"""
Load ViT-L-14 clip_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="clip_feature_extractor",
model_type="ViT-L-14",
@ -65,7 +120,18 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
def load_feature_extractor_model_clip_vitl14_336(self, device):
def load_feature_extractor_model_clip_vitl14_336(self, device: str = "cpu"):
"""
Load ViT-L-14-336 clip_feature_extractor model and preprocessors for visual and text inputs from lavis.models.
Args:
device (str): device to use. Can be "cpu" or "cuda". Default: "cpu".
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
"""
model, vis_processors, txt_processors = load_model_and_preprocess(
name="clip_feature_extractor",
model_type="ViT-L-14-336",
@ -74,11 +140,31 @@ class MultimodalSearch(AnalysisMethod):
)
return model, vis_processors, txt_processors
def read_img(self, filepath):
def read_img(self, filepath: str) -> Image:
"""
Load Image from filepath.
Args:
filepath (str): path to image.
Returns:
raw_image (PIL.Image): image.
"""
raw_image = Image.open(filepath).convert("RGB")
return raw_image
def read_and_process_images(self, image_paths, vis_processor):
def read_and_process_images(self, image_paths: list, vis_processor) -> tuple:
"""
Read and process images with vis_processor.
Args:
image_paths (list): paths to images.
vis_processor (dict): preprocessors for visual inputs.
Returns:
raw_images (list): list of images.
images_tensors (torch.Tensor): tensor of processed images stacked on the device.
"""
raw_images = [MultimodalSearch.read_img(self, path) for path in image_paths]
images = [
vis_processor["eval"](r_img)
@ -90,7 +176,19 @@ class MultimodalSearch(AnalysisMethod):
return raw_images, images_tensors
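A small sketch of this image-loading step, assuming `my_obj` and `vis_processors` come from the loader sketch above (the file paths are placeholders):

image_paths = ["data/IMG_2746.png", "data/IMG_2809.png"]  # placeholder paths
raw_images, images_tensors = my_obj.read_and_process_images(image_paths, vis_processors)
# images_tensors is a stacked torch.Tensor on MultimodalSearch.multimodal_device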
def extract_image_features_blip2(self, model, images_tensors):
def extract_image_features_blip2(
self, model, images_tensors: torch.Tensor
) -> torch.Tensor:
"""
Extract image features from images_tensors with blip2_feature_extractor model.
Args:
model (torch.nn.Module): model.
images_tensors (torch.Tensor): tensor of processed images stacked on the device.
Returns:
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
"""
with torch.cuda.amp.autocast(
enabled=(MultimodalSearch.multimodal_device != torch.device("cpu"))
):
@ -103,7 +201,19 @@ class MultimodalSearch(AnalysisMethod):
)
return features_image_stacked
def extract_image_features_clip(self, model, images_tensors):
def extract_image_features_clip(
self, model, images_tensors: torch.Tensor
) -> torch.Tensor:
"""
Extract image features from images_tensors with clip_feature_extractor model.
Args:
model (torch.nn.Module): model.
images_tensors (torch.Tensor): tensor of processed images stacked on the device.
Returns:
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
"""
features_image = [
model.extract_features({"image": ten}) for ten in images_tensors
]
@ -112,7 +222,19 @@ class MultimodalSearch(AnalysisMethod):
)
return features_image_stacked
def extract_image_features_basic(self, model, images_tensors):
def extract_image_features_basic(
self, model, images_tensors: torch.Tensor
) -> torch.Tensor:
"""
Extract image features from images_tensors with blip_feature_extractor or albef_feature_extractor model.
Args:
model (torch.nn.Module): model.
images_tensors (torch.Tensor): tensor of processed images stacked on the device.
Returns:
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
"""
features_image = [
model.extract_features({"image": ten, "text_input": ""}, mode="image")
for ten in images_tensors
@ -124,11 +246,23 @@ class MultimodalSearch(AnalysisMethod):
def save_tensors(
self,
model_type,
features_image_stacked,
name="saved_features_image.pt",
path="./saved_tensors/",
):
model_type: str,
features_image_stacked: torch.Tensor,
name: str = "saved_features_image.pt",
path: str = "./saved_tensors/",
) -> str:
"""
Save tensors as binary to given path.
Args:
model_type (str): type of the model.
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
name (str): name of the file. Default: "saved_features_image.pt".
path (str): path to save the file. Default: "./saved_tensors/".
Returns:
name (str): name of the file.
"""
if not os.path.exists(path):
os.makedirs(path)
with open(
@ -143,11 +277,30 @@ class MultimodalSearch(AnalysisMethod):
torch.save(features_image_stacked, f)
return name
def load_tensors(self, name):
def load_tensors(self, name: str) -> torch.Tensor:
"""
Load tensors from the given file.
Args:
name (str): name of the file.
Returns:
features_image_stacked (torch.Tensor): tensor of image features.
"""
features_image_stacked = torch.load(name)
return features_image_stacked
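A hedged sketch of the save/load round trip documented above (the model type, save path, and saved file name are illustrative; per the notebook below, the saved file follows the pattern <Number_of_images>_<model_name>_saved_features_image.pt):

my_obj.save_tensors("clip_base", features_image_stacked, path="./saved_tensors/")
# later, reload instead of recomputing the image features
features_image_stacked = my_obj.load_tensors("./saved_tensors/5_clip_base_saved_features_image.pt")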
def extract_text_features(self, model, text_input):
def extract_text_features(self, model, text_input: str) -> torch.Tensor:
"""
Extract text features from text_input with feature_extractor model.
Args:
model (torch.nn.Module): model.
text_input (str): text.
Returns:
features_text (torch.Tensor): tensor of text features.
"""
sample_text = {"text_input": [text_input]}
features_text = model.extract_features(sample_text, mode="text")
@ -155,18 +308,34 @@ class MultimodalSearch(AnalysisMethod):
def parsing_images(
self,
model_type,
path_to_saved_tensors="./saved_tensors/",
path_to_load_tensors=None,
):
model_type: str,
path_to_save_tensors: str = "./saved_tensors/",
path_to_load_tensors: str = None,
) -> tuple:
"""
Parse images with the feature_extractor model.
Args:
model_type (str): type of the model.
path_to_save_tensors (str): path to save the tensors. Default: "./saved_tensors/".
path_to_load_tensors (str): path to load the tensors from. Default: None.
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
image_keys (list): sorted list of image keys.
image_names (list): sorted list of image names.
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
"""
if model_type in ("clip_base", "clip_vitl14_336", "clip_vitl14"):
path_to_lib = lavis.__file__[:-11] + "models/clip_models/"
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz"
r = requests.get(url, allow_redirects=False)
open(path_to_lib + "bpe_simple_vocab_16e6.txt.gz", "wb").write(r.content)
image_keys = sorted(self.keys())
image_names = [self[k]["filename"] for k in image_keys]
image_keys = sorted(self.subdict.keys())
image_names = [self.subdict[k]["filename"] for k in image_keys]
select_model = {
"blip2": MultimodalSearch.load_feature_extractor_model_blip2,
@ -208,7 +377,7 @@ class MultimodalSearch(AnalysisMethod):
self, model, images_tensors
)
MultimodalSearch.save_tensors(
self, model_type, features_image_stacked, path=path_to_saved_tensors
self, model_type, features_image_stacked, path=path_to_save_tensors
)
else:
features_image_stacked = MultimodalSearch.load_tensors(
@ -225,8 +394,21 @@ class MultimodalSearch(AnalysisMethod):
)
def querys_processing(
self, search_query, model, txt_processors, vis_processors, model_type
):
self, search_query: list, model, txt_processors, vis_processors, model_type: str
) -> torch.Tensor:
"""
Process queries.
Args:
search_query (list): list of queries.
model (torch.nn.Module): model.
txt_processors (dict): preprocessors for text inputs.
vis_processors (dict): preprocessors for visual inputs.
model_type (str): type of the model.
Returns:
multi_features_stacked (torch.Tensor): tensor of query features.
"""
select_extract_image_features = {
"blip2": MultimodalSearch.extract_image_features_blip2,
"blip": MultimodalSearch.extract_image_features_basic,
@ -295,16 +477,35 @@ class MultimodalSearch(AnalysisMethod):
model,
vis_processors,
txt_processors,
model_type,
image_keys,
features_image_stacked,
search_query,
filter_number_of_images=None,
filter_val_limit=None,
filter_rel_error=None,
):
model_type: str,
image_keys: list,
features_image_stacked: torch.Tensor,
search_query: list,
filter_number_of_images: int = None,
filter_val_limit: float = None,
filter_rel_error: float = None,
) -> tuple:
"""
Search for images with the given queries.
Args:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
txt_processors (dict): preprocessors for text inputs.
model_type (str): type of the model.
image_keys (list): sorted list of image keys.
features_image_stacked (torch.Tensor): tensor of image features stacked on the device.
search_query (list): list of queries.
filter_number_of_images (int): number of images to show. Default: None.
filter_val_limit (float): lower limit of the similarity value. Default: None.
filter_rel_error (float): limit of the relative error in percent. Default: None.
Returns:
similarity (torch.Tensor): similarity scores between images and queries.
sorted_lists (list): lists of image indices sorted by similarity, one per query.
"""
if filter_number_of_images is None:
filter_number_of_images = len(self)
filter_number_of_images = len(self.subdict)
if filter_val_limit is None:
filter_val_limit = 0
if filter_rel_error is None:
@ -330,20 +531,29 @@ class MultimodalSearch(AnalysisMethod):
and 100 * abs(max_val - similarity[key][q].item()) / max_val
< filter_rel_error
):
self[image_keys[key]][
self.subdict[image_keys[key]][
"rank " + list(search_query[q].values())[0]
] = places[q][key]
self[image_keys[key]][
self.subdict[image_keys[key]][
list(search_query[q].values())[0]
] = similarity[key][q].item()
else:
self[image_keys[key]][
self.subdict[image_keys[key]][
"rank " + list(search_query[q].values())[0]
] = None
self[image_keys[key]][list(search_query[q].values())[0]] = 0
self.subdict[image_keys[key]][list(search_query[q].values())[0]] = 0
return similarity, sorted_lists
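A hedged call matching the updated notebook further down in this diff, using the optional filter to cap the number of ranked images (the value 20 is illustrative); it assumes the outputs of parsing_images and a prepared search_query list:

similarity, sorted_lists = my_obj.multimodal_search(
    model,
    vis_processors,
    txt_processors,
    model_type,
    image_keys,
    features_image_stacked,
    search_query,
    filter_number_of_images=20,  # keep at most 20 ranked images per query
)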
def itm_text_precessing(self, search_query):
def itm_text_precessing(self, search_query: list[dict[str, str]]) -> list:
"""
Process text queries for the itm model.
Args:
search_query (list): list of queries.
Returns:
text_query_index (list): list of indices of the text queries.
"""
for query in search_query:
if (len(query) != 1) and (query in ("image", "text_input")):
raise SyntaxError(
@ -356,11 +566,23 @@ class MultimodalSearch(AnalysisMethod):
return text_query_index
def get_pathes_from_query(self, query):
def get_pathes_from_query(self, query: dict[str, str]) -> tuple:
"""
Get paths and image names from query.
Args:
query (dict): query.
Returns:
paths (list): list of paths.
image_names (list): list of image names.
"""
paths = []
image_names = []
for s in sorted(
self.items(), key=lambda t: t[1][list(query.values())[0]], reverse=True
self.subdict.items(),
key=lambda t: t[1][list(query.values())[0]],
reverse=True,
):
if s[1]["rank " + list(query.values())[0]] is None:
break
@ -368,7 +590,18 @@ class MultimodalSearch(AnalysisMethod):
image_names.append(s[0])
return paths, image_names
def read_and_process_images_itm(self, image_paths, vis_processor):
def read_and_process_images_itm(self, image_paths: list, vis_processor) -> tuple:
"""
Read and process images with vis_processor for itm model.
Args:
image_paths (list): paths to images.
vis_processor (dict): preprocessors for visual inputs.
Returns:
raw_images (list): list of images.
images_tensors (torch.Tensor): tensor of processed images stacked on the device.
"""
raw_images = [MultimodalSearch.read_img(self, path) for path in image_paths]
images = [vis_processor(r_img) for r_img in raw_images]
images_tensors = torch.stack(images).to(MultimodalSearch.multimodal_device)
@ -377,12 +610,26 @@ class MultimodalSearch(AnalysisMethod):
def compute_gradcam_batch(
self,
model,
visual_input,
text_input,
tokenized_text,
block_num=6,
):
model: torch.nn.Module,
visual_input: torch.Tensor,
text_input: str,
tokenized_text: torch.Tensor,
block_num: int = 6,
) -> tuple:
"""
Compute gradcam for itm model.
Args:
model (torch.nn.Module): model.
visual_input (torch.Tensor): tensor of processed images stacked on the device.
text_input (str): text.
tokenized_text (torch.Tensor): tokenized text.
block_num (int): number of block. Default: 6.
Returns:
gradcam (torch.Tensor): gradcam.
output (torch.Tensor): output of model.
"""
model.text_encoder.base_model.base_model.encoder.layer[
block_num
].crossattention.self.save_attention = True
@ -430,7 +677,16 @@ class MultimodalSearch(AnalysisMethod):
return gradcam, output
def resize_img(self, raw_img):
def resize_img(self, raw_img: Image):
"""
Proportionally resize an image to 240 px width.
Args:
raw_img (PIL.Image): image.
Returns:
resized_image (PIL.Image): image proportionally resized to 240 px width.
"""
w, h = raw_img.size
scaling_factor = 240 / w
resized_image = raw_img.resize(
@ -438,7 +694,25 @@ class MultimodalSearch(AnalysisMethod):
)
return resized_image
def get_att_map(self, img, att_map, blur=True, overlap=True):
def get_att_map(
self,
img: np.ndarray,
att_map: np.ndarray,
blur: bool = True,
overlap: bool = True,
) -> np.ndarray:
"""
Get attention map.
Args:
img (np.ndarray): image.
att_map (np.ndarray): attention map.
blur (bool): blur attention map. Default: True.
overlap (bool): overlap attention map with image. Default: True.
Returns:
att_map (np.ndarray): attention map.
"""
att_map -= att_map.min()
if att_map.max() > 0:
att_map /= att_map.max()
@ -459,7 +733,14 @@ class MultimodalSearch(AnalysisMethod):
)
return att_map
def upload_model_blip2_coco(self):
def upload_model_blip2_coco(self) -> tuple:
"""
Load coco blip2_image_text_matching model and preprocessors for visual inputs from lavis.models.
Returns:
itm_model (torch.nn.Module): model.
vis_processor (dict): preprocessors for visual inputs.
"""
itm_model = load_model(
"blip2_image_text_matching",
"coco",
@ -469,7 +750,14 @@ class MultimodalSearch(AnalysisMethod):
vis_processor = load_processor("blip_image_eval").build(image_size=364)
return itm_model, vis_processor
def upload_model_blip_base(self):
def upload_model_blip_base(self) -> tuple:
"""
Load base blip_image_text_matching model and preprocessors for visual input from lavis.models.
Returns:
itm_model (torch.nn.Module): model.
vis_processor (dict): preprocessors for visual inputs.
"""
itm_model = load_model(
"blip_image_text_matching",
"base",
@ -479,7 +767,14 @@ class MultimodalSearch(AnalysisMethod):
vis_processor = load_processor("blip_image_eval").build(image_size=384)
return itm_model, vis_processor
def upload_model_blip_large(self):
def upload_model_blip_large(self) -> tuple:
"""
Load large blip_image_text_matching model and preprocessors for visual input from lavis.models.
Returns:
itm_model (torch.nn.Module): model.
vis_processor (dict): preprocessors for visual inputs.
"""
itm_model = load_model(
"blip_image_text_matching",
"large",
@ -491,13 +786,28 @@ class MultimodalSearch(AnalysisMethod):
def image_text_match_reordering(
self,
search_query,
itm_model_type,
image_keys,
sorted_lists,
batch_size=1,
need_grad_cam=False,
):
search_query: list[dict[str, str]],
itm_model_type: str,
image_keys: list,
sorted_lists: list[list],
batch_size: int = 1,
need_grad_cam: bool = False,
) -> tuple:
"""
Reorder images with itm model.
Args:
search_query (list): list of queries.
itm_model_type (str): type of the model.
image_keys (list): sorted list of image keys.
sorted_lists (list): lists of image indices sorted by similarity, one per query.
batch_size (int): batch size. Default: 1.
need_grad_cam (bool): whether to compute gradcam heatmaps. Default: False. The blip2_coco model does not yet work with gradcam.
Returns:
itm_scores2: list of itm scores.
image_gradcam_with_itm: dict of image names and gradcam.
"""
if itm_model_type == "blip2_coco" and need_grad_cam is True:
raise SyntaxError(
"The blip2_coco model does not yet work with gradcam. Please set need_grad_cam to False"
@ -588,17 +898,17 @@ class MultimodalSearch(AnalysisMethod):
}
for i, key in zip(range(len(image_keys)), sorted_lists[index_text_query]):
if image_keys[key] in image_names:
self[image_keys[key]][
self.subdict[image_keys[key]][
"itm " + list(search_query[index_text_query].values())[0]
] = image_names_with_itm[image_keys[key]]
self[image_keys[key]][
self.subdict[image_keys[key]][
"itm_rank " + list(search_query[index_text_query].values())[0]
] = image_names_with_new_rank[image_keys[key]]
else:
self[image_keys[key]][
self.subdict[image_keys[key]][
"itm " + list(search_query[index_text_query].values())[0]
] = 0
self[image_keys[key]][
self.subdict[image_keys[key]][
"itm_rank " + list(search_query[index_text_query].values())[0]
] = None
@ -624,7 +934,17 @@ class MultimodalSearch(AnalysisMethod):
torch.cuda.empty_cache()
return itm_scores2, image_gradcam_with_itm
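A hedged usage sketch mirroring the updated notebook cells later in this diff (the itm model choice is illustrative; it assumes search_query, image_keys, and sorted_lists from the multimodal search step above):

itm_scores, image_gradcam_with_itm = my_obj.image_text_match_reordering(
    search_query,
    "blip_base",
    image_keys,
    sorted_lists,
    batch_size=1,
    need_grad_cam=True,
)
my_obj.show_results(search_query[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm)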
def show_results(self, query, itm=False, image_gradcam_with_itm=False):
def show_results(
self, query: dict, itm=False, image_gradcam_with_itm=False
) -> None:
"""
Show results of search.
Args:
query (dict): query.
itm (bool): use itm model. Default: False.
image_gradcam_with_itm (dict): dict of image names and gradcam heatmaps, as returned by image_text_match_reordering, or False if unused. Default: False.
"""
if "image" in query.keys():
pic = Image.open(query["image"]).convert("RGB")
pic.thumbnail((400, 400))
@ -648,7 +968,7 @@ class MultimodalSearch(AnalysisMethod):
current_querry_rank = "rank " + list(query.values())[0]
for s in sorted(
self.items(), key=lambda t: t[1][current_querry_val], reverse=True
self.subdict.items(), key=lambda t: t[1][current_querry_val], reverse=True
):
if s[1][current_querry_rank] is None:
break

View file

@ -7,9 +7,18 @@ from lavis.models import load_model_and_preprocess
class SummaryDetector(AnalysisMethod):
def __init__(self, subdict: dict) -> None:
super().__init__(subdict)
self.summary_device = device("cuda" if cuda.is_available() else "cpu")
self.summary_device = "cuda" if cuda.is_available() else "cpu"
def load_model_base(self):
"""
Load base_coco blip_caption model and preprocessors for visual inputs from lavis.models.
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
"""
summary_model, summary_vis_processors, _ = load_model_and_preprocess(
name="blip_caption",
model_type="base_coco",
@ -19,6 +28,15 @@ class SummaryDetector(AnalysisMethod):
return summary_model, summary_vis_processors
def load_model_large(self):
"""
Load large_coco blip_caption model and preprocessors for visual inputs from lavis.models.
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
"""
summary_model, summary_vis_processors, _ = load_model_and_preprocess(
name="blip_caption",
model_type="large_coco",
@ -27,7 +45,17 @@ class SummaryDetector(AnalysisMethod):
)
return summary_model, summary_vis_processors
def load_model(self, model_type):
def load_model(self, model_type: str):
"""
Load blip_caption model and preprocessors for visual inputs from lavis.models.
Args:
model_type (str): type of the model.
Returns:
model (torch.nn.Module): model.
vis_processors (dict): preprocessors for visual inputs.
"""
select_model = {
"base": SummaryDetector.load_model_base,
"large": SummaryDetector.load_model_large,
@ -36,6 +64,16 @@ class SummaryDetector(AnalysisMethod):
return summary_model, summary_vis_processors
def analyse_image(self, summary_model=None, summary_vis_processors=None):
"""
Create one constant and three non-deterministic captions for the image.
Args:
summary_model (torch.nn.Module): model.
summary_vis_processors (dict): preprocessors for visual inputs.
Returns:
self.subdict (dict): dictionary with the constant image summary and three non-deterministic summaries.
"""
if summary_model is None and summary_vis_processors is None:
summary_model, summary_vis_processors = self.load_model_base()
@ -55,7 +93,16 @@ class SummaryDetector(AnalysisMethod):
)
return self.subdict
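A hedged sketch of the captioning and question-answering workflow, as used in the updated summary notebook later in this diff (the folder path and the question string are illustrative placeholders):

import ammico.summary as sm
import ammico.utils as mutils

images = mutils.find_files(path="data/", limit=10)  # placeholder folder
mydict = mutils.initialize_dict(images)

obj = sm.SummaryDetector(mydict)
summary_model, summary_vis_processors = obj.load_model(model_type="base")
for key in mydict:
    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image(
        summary_model=summary_model, summary_vis_processors=summary_vis_processors
    )

# visual question answering; the question string is an illustrative placeholder
list_of_questions = ["How many persons are in the picture?"]
for key in mydict:
    mydict[key] = sm.SummaryDetector(mydict[key]).analyse_questions(list_of_questions)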
def analyse_questions(self, list_of_questions):
def analyse_questions(self, list_of_questions: list[str]) -> dict:
"""
Generate answers to free-form questions about the image, written in natural language.
Args:
list_of_questions (list[str]): list of questions.
Returns:
self.subdict (dict): dictionary with answers to questions.
"""
(
summary_vqa_model,
summary_vqa_vis_processors,

View file

@ -354,6 +354,7 @@ def test_parsing_images(
tmp_path,
):
ms.MultimodalSearch.multimodal_device = pre_multimodal_device
my_obj = ms.MultimodalSearch(get_testdict)
(
model,
vis_processor,
@ -361,9 +362,7 @@ def test_parsing_images(
image_keys,
_,
features_image_stacked,
) = ms.MultimodalSearch.parsing_images(
get_testdict, pre_model, path_to_saved_tensors=tmp_path
)
) = my_obj.parsing_images(pre_model, path_to_save_tensors=tmp_path)
for i, num in zip(range(10), features_image_stacked[0, 10:12].tolist()):
assert (
@ -371,7 +370,7 @@ def test_parsing_images(
is True
)
test_pic = Image.open(get_testdict["IMG_2746"]["filename"]).convert("RGB")
test_pic = Image.open(my_obj.subdict["IMG_2746"]["filename"]).convert("RGB")
test_querry = (
"The bird sat on a tree located at the intersection of 23rd and 43rd streets."
)
@ -387,10 +386,10 @@ def test_parsing_images(
search_query = [
{"text_input": test_querry},
{"image": get_testdict["IMG_2746"]["filename"]},
{"image": my_obj.subdict["IMG_2746"]["filename"]},
]
multi_features_stacked = ms.MultimodalSearch.querys_processing(
get_testdict, search_query, model, txt_processor, vis_processor, pre_model
multi_features_stacked = my_obj.querys_processing(
search_query, model, txt_processor, vis_processor, pre_model
)
for i, num in zip(range(10), multi_features_stacked[0, 10:12].tolist()):
@ -410,8 +409,7 @@ def test_parsing_images(
{"image": get_path + "IMG_3758.png"},
]
similarity, sorted_list = ms.MultimodalSearch.multimodal_search(
get_testdict,
similarity, sorted_list = my_obj.multimodal_search(
model,
vis_processor,
txt_processor,
@ -440,6 +438,7 @@ def test_parsing_images(
features_image_stacked,
processed_pic,
multi_features_stacked,
my_obj,
)
cuda.empty_cache()
@ -452,12 +451,12 @@ def test_itm(get_test_my_dict, get_path):
]
image_keys = ["IMG_2746", "IMG_2809"]
sorted_list = [[1, 0], [1, 0]]
my_obj = ms.MultimodalSearch(get_test_my_dict)
for itm_model in ["blip_base", "blip_large"]:
(
itm_scores,
image_gradcam_with_itm,
) = ms.MultimodalSearch.image_text_match_reordering(
get_test_my_dict,
) = my_obj.image_text_match_reordering(
search_query3,
itm_model,
image_keys,
@ -497,12 +496,12 @@ def test_itm_blip2_coco(get_test_my_dict, get_path):
]
image_keys = ["IMG_2746", "IMG_2809"]
sorted_list = [[1, 0], [1, 0]]
my_obj = ms.MultimodalSearch(get_test_my_dict)
(
itm_scores,
image_gradcam_with_itm,
) = ms.MultimodalSearch.image_text_match_reordering(
get_test_my_dict,
) = my_obj.image_text_match_reordering(
search_query3,
"blip2_coco",
image_keys,

View file

@ -1,4 +1,5 @@
from google.cloud import vision
from google.auth.exceptions import DefaultCredentialsError
from googletrans import Translator
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
@ -60,7 +61,12 @@ class TextDetector(utils.AnalysisMethod):
def get_text_from_image(self):
"""Detects text on the image."""
path = self.subdict["filename"]
client = vision.ImageAnnotatorClient()
try:
client = vision.ImageAnnotatorClient()
except DefaultCredentialsError:
raise DefaultCredentialsError(
"Please provide credentials for google cloud vision API, see https://cloud.google.com/docs/authentication/application-default-credentials."
)
with io.open(path, "rb") as image_file:
content = image_file.read()
image = vision.Image(content=content)
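The new DefaultCredentialsError handling expects Google Cloud application-default credentials to be configured; a minimal sketch of one common way to provide them before constructing the client (the key file path is a placeholder):

import os

# point the google-cloud-vision client at a service account key (placeholder path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account-key.json"
client = vision.ImageAnnotatorClient()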

View file

@ -45,7 +45,7 @@ class AnalysisMethod:
def find_files(path=None, pattern="*.png", recursive=True, limit=20):
"""Find image files on the file system
"""Find image files on the file system.
:param path:
The base directory where we are looking for the images. Defaults

View file

@ -50,6 +50,22 @@ cropposts module
----------------
.. automodule:: cropposts
:members:
:undoc-members:
:show-inheritance:
multimodal search module
------------------------
.. automodule:: multimodal_search
:members:
:undoc-members:
:show-inheritance:
summary module
--------------
.. automodule:: summary
:members:
:undoc-members:
:show-inheritance:

View file

@ -4,4 +4,4 @@ AMMICO package modules
.. toctree::
:maxdepth: 4
misinformation
ammico

View file

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "d2c4d40d-8aca-4024-8d19-a65c4efe825d",
"metadata": {},
@ -14,16 +15,41 @@
"id": "51f8888b-d1a3-4b85-a596-95c0993fa192",
"metadata": {},
"source": [
"This notebooks shows some preliminary work on detecting facial expressions with DeepFace. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `ammico` package that is imported here:"
"Facial expressions can be detected using [DeepFace](https://github.com/serengil/deepface) and [RetinaFace](https://github.com/serengil/retinaface).\n",
"\n",
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
"\n",
"After that, we can import `ammico` and read in the files given a folder path."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50c1c1c7",
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
" # update python version\n",
" # install setuptools\n",
" # %pip install setuptools==61 -qqq\n",
" # install ammico\n",
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" # mount google drive for data and API key\n",
" from google.colab import drive\n",
"\n",
" drive.mount(\"/content/drive\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b21e52a5-d379-42db-aae6-f2ab9ed9a369",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"import ammico\n",
@ -32,22 +58,23 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "a2bd2153",
"metadata": {},
"source": [
"We select a subset of image files to try facial expression detection on. The `find_files` function finds image files within a given directory:"
"We select a subset of image files to try facial expression detection on, see the `limit` keyword. The `find_files` function finds image files within a given directory:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afe7e638-f09d-47e7-9295-1c374bd64c53",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# Here you need to provide the path to your google drive folder\n",
"# or local folder containing the images\n",
"images = mutils.find_files(\n",
" path=\"data/\",\n",
" limit=10,\n",
@ -55,24 +82,7 @@
]
},
{
"cell_type": "markdown",
"id": "e149bfe5-90b0-49b2-af3d-688e41aab019",
"metadata": {},
"source": [
"If you want to fine tune the discovery of image files, you can provide more parameters:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f38bb8ed-1004-4e33-8ed6-793cb5869400",
"metadata": {},
"outputs": [],
"source": [
"?mutils.find_files"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "705e7328",
"metadata": {},
@ -84,50 +94,47 @@
"cell_type": "code",
"execution_count": null,
"id": "b37c0c91",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"mydict = mutils.initialize_dict(images)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "a9372561",
"metadata": {},
"source": [
"To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this and directly export a csv file in the step below.\n",
"Here, we display the face recognition results provided by the DeepFace library. Click on the tabs to see the results in the right sidebar:"
"Here, we display the face recognition results provided by the DeepFace and RetinaFace libraries. Click on the tabs to see the results in the right sidebar. You may need to increment the `port` number if you are already running several notebook instances on the same server."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "992499ed-33f1-4425-ad5d-738cf565d175",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"mdisplay.explore_analysis(mydict, identify=\"faces\")"
"analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"faces\")\n",
"analysis_explorer.run_server(port = 8050)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6f974341",
"metadata": {},
"source": [
"Directly carry out the analysis and export the result into a csv: Analysis - "
"Instead of inspecting each of the images, you can also directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f97c7d0",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"for key in mydict.keys():\n",
@ -135,20 +142,19 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "174357b1",
"metadata": {},
"source": [
"Convert the dictionary of dictionarys into a dictionary with lists:"
"These steps are required to convert the dictionary of dictionarys into a dictionary with lists, that can be converted into a pandas dataframe and exported to a csv file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "604bd257",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"outdict = mutils.append_data_to_dict(mydict)\n",
@ -156,6 +162,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8373d9f8",
"metadata": {},
@ -167,32 +174,29 @@
"cell_type": "code",
"execution_count": null,
"id": "aa4b518a",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"df.head(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "579cd59f",
"metadata": {},
"source": [
"Write the csv file:"
"Write the csv file - here you should provide a file path and file name for the csv file to be written."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4618decb",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"data/data_out.csv\")"
"df.to_csv(\"data_out.csv\")"
]
},
{
@ -206,7 +210,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},

View file

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "22df2297-0629-45aa-b88c-6c61f1544db6",
"metadata": {},
@ -14,7 +15,34 @@
"id": "9eeeb302-296e-48dc-86c7-254aa02f2b3a",
"metadata": {},
"source": [
"This notebooks shows some preliminary work on Image Multimodal Search with lavis library. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `ammico` package that is imported here:"
"This notebooks shows how to carry out an image multimodal search with the [LAVIS](https://github.com/salesforce/LAVIS) library. \n",
"\n",
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
"\n",
"After that, we can import `ammico` and read in the files given a folder path."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b0a6bdf",
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
" # update python version\n",
" # install setuptools\n",
" # %pip install setuptools==61 -qqq\n",
" # install ammico\n",
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" # mount google drive for data and API key\n",
" from google.colab import drive\n",
"\n",
" drive.mount(\"/content/drive\")"
]
},
{
@ -26,18 +54,10 @@
},
"outputs": [],
"source": [
"import ammico\n",
"import ammico.utils as mutils\n",
"import ammico.multimodal_search as ms"
]
},
{
"cell_type": "markdown",
"id": "acf08b44-3ea6-44cd-926d-15c0fd9f39e0",
"metadata": {},
"source": [
"Set an image path as input file path."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -47,12 +67,22 @@
},
"outputs": [],
"source": [
"images = ammico.utils.find_files(\n",
"images = mutils.find_files(\n",
" path=\"data/\",\n",
" limit=10,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a08bd3a9-e954-4a0e-ad64-6817abd3a25a",
"metadata": {},
"outputs": [],
"source": [
"images"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -62,10 +92,21 @@
},
"outputs": [],
"source": [
"mydict = ammico.utils.initialize_dict(images)"
"mydict = mutils.initialize_dict(images)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c091f95-07cf-42c3-82c8-5f3a3c5929f8",
"metadata": {},
"outputs": [],
"source": [
"mydict"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "987540a8-d800-4c70-a76b-7bfabaf123fa",
"metadata": {},
@ -74,11 +115,18 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "66d6ede4-00bc-4aeb-9a36-e52d7de33fe5",
"metadata": {},
"source": [
"You can choose one of the following models: blip, blip2, albef, clip_base, clip_vitl14, clip_vitl14_336"
"First you need to select a model. You can choose one of the following models: \n",
"- [blip](https://github.com/salesforce/BLIP)\n",
"- [blip2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) \n",
"- [albef](https://github.com/salesforce/ALBEF) \n",
"- [clip_base](https://github.com/openai/CLIP/blob/main/model-card.md)\n",
"- [clip_vitl14](https://github.com/mlfoundations/open_clip) \n",
"- [clip_vitl14_336](https://github.com/mlfoundations/open_clip)"
]
},
{
@ -98,6 +146,35 @@
"# model_type = \"clip_vitl14_336\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "357828c9",
"metadata": {},
"source": [
"To process the loaded images using the selected model, use the below code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6f2c9b1-4a91-47cb-86b5-2c9c67e4837b",
"metadata": {},
"outputs": [],
"source": [
"my_obj = ms.MultimodalSearch(mydict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16603ded-078e-4362-847b-57ad76829327",
"metadata": {},
"outputs": [],
"source": [
"my_obj.subdict"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -114,15 +191,31 @@
" image_keys,\n",
" image_names,\n",
" features_image_stacked,\n",
") = ms.MultimodalSearch.parsing_images(mydict, model_type, path_to_saved_tensors=\".\")"
") = my_obj.parsing_images(\n",
" model_type, \n",
" path_to_save_tensors=\"data/\",\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f236c3b1-c3a6-471a-9fc5-ef831b675286",
"metadata": {},
"outputs": [],
"source": [
"features_image_stacked"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9ff8a894-566b-4c4f-acca-21c50b5b1f52",
"metadata": {},
"source": [
"The tensors of all images `features_image_stacked` was saved in `<Number_of_images>_<model_name>_saved_features_image.pt`. If you run it once for current model and current set of images you do not need to repeat it again. Instead you can load this features with the command:"
"The images are then processed and stored in a numerical representation, a tensor. These tensors do not change for the same image and same model - so if you run this analysis once, and save the tensors giving a path with the keyword `path_to_save_tensors`, a file with filename `.<Number_of_images>_<model_name>_saved_features_image.pt` will be placed there.\n",
"\n",
"This will save you a lot of time if you want to analyse same images with the same model but different questions. To run using the saved tensors, execute the below code giving the path and name of the tensor file."
]
},
{
@ -135,28 +228,35 @@
"outputs": [],
"source": [
"# (\n",
"# model,\n",
"# vis_processors,\n",
"# txt_processors,\n",
"# image_keys,\n",
"# image_names,\n",
"# features_image_stacked,\n",
"# ) = ms.MultimodalSearch.parsing_images(mydict, model_type,\"18_clip_base_saved_features_image.pt\")"
"# model,\n",
"# vis_processors,\n",
"# txt_processors,\n",
"# image_keys,\n",
"# image_names,\n",
"# features_image_stacked,\n",
"# ) = my_obj.parsing_images(\n",
"# model_type,\n",
"# path_to_load_tensors=\"/content/drive/MyDrive/misinformation-data/5_clip_base_saved_features_image.pt\",\n",
"# )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "309923c1-d6f8-4424-8fca-bde5f3a98b38",
"metadata": {},
"source": [
"Here we already processed our image folder with 18 images with `clip_base` model. So you need just write the name `18_clip_base_saved_features_image.pt` of the saved file that consists of tensors of all images as a 3rd argument to the previous function. "
"Here we already processed our image folder with 5 images and the `clip_base` model. So you need just to write the name `5_clip_base_saved_features_image.pt` of the saved file that consists of tensors of all images as keyword argument for `path_to_load_tensors`. "
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "162a52e8-6652-4897-b92e-645cab07aaef",
"metadata": {},
"source": [
"## Formulate your search queries\n",
"\n",
"Next, you need to form search queries. You can search either by image or by text. You can search for a single query, or you can search for several queries at once, the computational time should not be much different. The format of the queries is as follows:"
]
},
@ -171,12 +271,13 @@
"source": [
"search_query3 = [\n",
" {\"text_input\": \"politician press conference\"},\n",
" {\"text_input\": \"a person wearing a mask\"},\n",
" {\"image\": \"data/106349S_por.png\"},\n",
" {\"text_input\": \"a world map\"},\n",
" {\"text_input\": \"a dog\"},\n",
"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8bcf3127-3dfd-4ff4-b9e7-a043099b1418",
"metadata": {},
@ -196,8 +297,7 @@
},
"outputs": [],
"source": [
"similarity = ms.MultimodalSearch.multimodal_search(\n",
" mydict,\n",
"similarity, sorted_lists = my_obj.multimodal_search(\n",
" model,\n",
" vis_processors,\n",
" txt_processors,\n",
@ -205,10 +305,42 @@
" image_keys,\n",
" features_image_stacked,\n",
" search_query3,\n",
" filter_number_of_images=20,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65210ca2-b674-44bd-807a-4165e14bad74",
"metadata": {},
"outputs": [],
"source": [
"similarity"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "557473df-e2b9-4ef0-9439-3daadf6741ac",
"metadata": {},
"outputs": [],
"source": [
"sorted_lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c93d7e88-594d-4095-b5f2-7bf01210dc61",
"metadata": {},
"outputs": [],
"source": [
"mydict"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e1cf7e46-0c2c-4fb2-b89a-ef585ccb9339",
"metadata": {},
@ -229,6 +361,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cd3ee120-8561-482b-a76a-e8f996783325",
"metadata": {},
@ -245,10 +378,80 @@
},
"outputs": [],
"source": [
"ms.MultimodalSearch.show_results(mydict, search_query3[0])"
"my_obj.show_results(\n",
" search_query3[0],\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0b750e9f-fe64-4028-9caf-52d7187462f1",
"metadata": {},
"source": [
"## Improve the search results\n",
"\n",
"For even better results, a slightly different approach has been prepared that can improve search results. It is quite resource-intensive, so it is applied after the main algorithm has found the most relevant images. This approach works only with text queries. Among the parameters you can choose 3 models: `\"blip_base\"`, `\"blip_large\"`, `\"blip2_coco\"`. If you get an `Out of Memory` error, try reducing the batch_size value (minimum = 1), which is the number of images being processed simultaneously. With the parameter `need_grad_cam = True/False` you can enable the calculation of the heat map of each image to be processed. Thus the `image_text_match_reordering` function calculates new similarity values and new ranks for each image. The resulting values are added to the general dictionary."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3af7b39-6d0d-4da3-9b8f-7dfd3f5779be",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"itm_model = \"blip_base\"\n",
"# itm_model = \"blip_large\"\n",
"# itm_model = \"blip2_coco\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caf1f4ae-4b37-4954-800e-7120f0419de5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"itm_scores, image_gradcam_with_itm = my_obj.image_text_match_reordering(\n",
" search_query3,\n",
" itm_model,\n",
" image_keys,\n",
" sorted_lists,\n",
" batch_size=1,\n",
" need_grad_cam=True,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9e98c150-5fab-4251-bce7-0d8fc7b385b9",
"metadata": {},
"source": [
"Then using the same output function you can add the `ITM=True` arguments to output the new image order. You can also add the `image_gradcam_with_itm` argument to output the heat maps of the calculated images. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a829b99-5230-463a-8b11-30ffbb67fc3a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"my_obj.show_results(\n",
" search_query3[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d86ab96b-1907-4b7f-a78e-3983b516d781",
"metadata": {
@ -259,6 +462,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "4bdbc4d4-695d-4751-ab7c-d2d98e2917d7",
"metadata": {
@ -277,11 +481,12 @@
},
"outputs": [],
"source": [
"outdict = ammico.utils.append_data_to_dict(mydict)\n",
"df = ammico.utils.dump_df(outdict)"
"outdict = mutils.append_data_to_dict(mydict)\n",
"df = mutils.dump_df(outdict)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ea2675d5-604c-45e7-86d2-080b1f4559a0",
"metadata": {
@ -304,6 +509,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "05546d99-afab-4565-8f30-f14e1426abcf",
"metadata": {},
@ -320,13 +526,13 @@
},
"outputs": [],
"source": [
"df.to_csv(\"./data_out.csv\")"
"df.to_csv(\"data/data_out.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ef1132f-eb2a-43d7-be1f-69e879490f33",
"id": "b6a79201-7c17-496c-a6a1-b8ecfd3dd1e8",
"metadata": {},
"outputs": [],
"source": []

View file

@ -1,10 +1,11 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Objects Expression recognition"
"# Objects recognition"
]
},
{
@ -12,15 +13,39 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebooks shows some preliminary work on detecting objects expressions with cvlib. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `ammico` package that is imported here:"
"This notebooks shows how to detect objects quickly using [cvlib](https://github.com/arunponnusamy/cvlib) and the [YOLOv4](https://github.com/AlexeyAB/darknet) model. This library detects faces, people, and several inanimate objects; we currently have restricted the output to person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, cell phone.\n",
"\n",
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
"\n",
"After that, we can import `ammico` and read in the files given a folder path."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
" # update python version\n",
" # install setuptools\n",
" # %pip install setuptools==61 -qqq\n",
" # install ammico\n",
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" # mount google drive for data and API key\n",
" from google.colab import drive\n",
"\n",
" drive.mount(\"/content/drive\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ammico\n",
@ -30,6 +55,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -39,11 +65,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# Here you need to provide the path to your google drive folder\n",
"# or local folder containing the images\n",
"images = mutils.find_files(\n",
" path=\"data/\",\n",
" limit=10,\n",
@ -53,47 +79,25 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"mydict = mutils.initialize_dict(images)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Manually inspect what was detected\n",
"\n",
"To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing."
"## Detect objects and directly write to csv\n",
"You can directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"mdisplay.explore_analysis(mydict, identify=\"objects\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Detect objects and directly write to csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"for key in mydict:\n",
@ -101,6 +105,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -110,9 +115,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"outdict = mutils.append_data_to_dict(mydict)\n",
@ -120,6 +123,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -129,15 +133,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"df.head(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -147,12 +150,31 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"./data_out.csv\")"
"df.to_csv(\"data_out.csv\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Manually inspect what was detected\n",
"\n",
"To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can directly export a csv file in the step above.\n",
"Here, we display the object detection results provided by the above library. Click on the tabs to see the results in the right sidebar. You may need to increment the `port` number if you are already running several notebook instances on the same server."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"objects\")\n",
"analysis_explorer.run_server(port=8056)"
]
},
{
@ -165,7 +187,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},

View file

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -12,7 +13,33 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebooks shows some preliminary work on Image Captioning and Visual question answering with lavis. It is mainly meant to explore its capabilities and to decide on future research directions. We package our code into a `ammico` package that is imported here:"
"This notebooks shows how to generate image captions and use the visual question answering with [LAVIS](https://github.com/salesforce/LAVIS). \n",
"\n",
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
"\n",
"After that, we can import `ammico` and read in the files given a folder path."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
"# flake8-noqa-cell\n",
"import os\n",
"\n",
"if \"google.colab\" in str(get_ipython()):\n",
" # update python version\n",
" # install setuptools\n",
" # %pip install setuptools==61 -qqq\n",
" # install ammico\n",
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" # mount google drive for data and API key\n",
" from google.colab import drive\n",
"\n",
" drive.mount(\"/content/drive\")"
]
},
{
@ -23,18 +50,12 @@
},
"outputs": [],
"source": [
"import ammico\n",
"from ammico import utils as mutils\n",
"from ammico import display as mdisplay\n",
"import ammico.summary as sm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set an image path as input file path."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -43,6 +64,8 @@
},
"outputs": [],
"source": [
"# Here you need to provide the path to your google drive folder\n",
"# or local folder containing the images\n",
"images = mutils.find_files(\n",
" path=\"data/\",\n",
" limit=10,\n",
@ -61,6 +84,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -68,10 +92,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Here you can choose between two models: \"base\" or \"large\""
"Here you can choose between two models: \"base\" or \"large\". This will generate the caption for each image and directly put the results in a dataframe. This dataframe can be exported as a csv file.\n",
"\n",
"The results are written into the columns `const_image_summary` - this will always be the same result (as always the same seed will be used). The column `3_non-deterministic summary` displays three different answers generated with different seeds, these are most likely different when you run the analysis again."
]
},
{
@ -83,8 +110,8 @@
"outputs": [],
"source": [
"obj = sm.SummaryDetector(mydict)\n",
"summary_model, summary_vis_processors = obj.load_model(\"base\")\n",
"# summary_model, summary_vis_processors = obj.load_model(\"large\")"
"summary_model, summary_vis_processors = obj.load_model(model_type=\"base\")\n",
"# summary_model, summary_vis_processors = mutils.load_model(\"large\")"
]
},
{
@ -97,17 +124,18 @@
"source": [
"for key in mydict:\n",
" mydict[key] = sm.SummaryDetector(mydict[key]).analyse_image(\n",
" summary_model, summary_vis_processors\n",
" summary_model=summary_model, summary_vis_processors=summary_vis_processors\n",
" )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"Convert the dictionary of dictionaries into a dictionary with lists:"
"Convert the dictionary of dictionarys into a dictionary with lists:"
]
},
{
@ -123,6 +151,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -141,6 +170,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -150,15 +180,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"./data_out.csv\")"
"df.to_csv(\"data_out.csv\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -179,10 +208,12 @@
},
"outputs": [],
"source": [
"mdisplay.explore_analysis(mydict, identify=\"summary\")"
"analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"summary\")\n",
"analysis_explorer.run_server(port=8055)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -190,10 +221,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the list of questions"
"Set the list of questions as a list of strings:"
]
},
{
@ -209,6 +241,33 @@
"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Explore the analysis using the interface:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"summary\")\n",
"analysis_explorer.run_server(port=8055)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Or directly analyze for further processing\n",
"Instead of inspecting each of the images, you can also directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
@ -220,19 +279,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mdisplay.explore_analysis(mydict, identify=\"summary\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert the dictionary of dictionarys into a dictionary with lists:"
"## Convert to dataframe and write csv\n",
"These steps are required to convert the dictionary of dictionarys into a dictionary with lists, that can be converted into a pandas dataframe and exported to a csv file."
]
},
{
@ -260,7 +312,7 @@
"metadata": {},
"outputs": [],
"source": [
"df2.to_csv(\"./data_out2.csv\")"
"df2.to_csv(\"data_out2.csv\")"
]
},
{
@ -273,7 +325,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},

View file

@ -1,21 +1,36 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "dcaa3da1",
"metadata": {},
"source": [
"# Text extraction on image\n",
"Inga Ulusoy, SSC, July 2022"
"# Notebook for text extraction on image\n",
"\n",
"The text extraction and analysis is carried out using a variety of tools: \n",
"\n",
"1. Text extraction from the image using [google-cloud-vision](https://cloud.google.com/vision) \n",
"1. Language detection of the extracted text using [Googletrans](https://py-googletrans.readthedocs.io/en/latest/) \n",
"1. Translation into English or other languages using [Googletrans](https://py-googletrans.readthedocs.io/en/latest/) \n",
"1. Cleaning of the text using [spacy](https://spacy.io/) \n",
"1. Spell-check using [TextBlob](https://textblob.readthedocs.io/en/dev/index.html) \n",
"1. Subjectivity analysis using [TextBlob](https://textblob.readthedocs.io/en/dev/index.html) \n",
"1. Text summarization using [transformers](https://huggingface.co/docs/transformers/index) pipelines\n",
"1. Sentiment analysis using [transformers](https://huggingface.co/docs/transformers/index) pipelines \n",
"1. Named entity recognition using [transformers](https://huggingface.co/docs/transformers/index) pipelines \n",
"1. Topic analysis using [BERTopic](https://github.com/MaartenGr/BERTopic) \n",
"\n",
"The first cell is only run on google colab and installs the [ammico](https://github.com/ssciwr/AMMICO) package.\n",
"\n",
"After that, we can import `ammico` and read in the files given a folder path."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f43f327c",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# if running on google colab\n",
@ -25,9 +40,9 @@
"if \"google.colab\" in str(get_ipython()):\n",
" # update python version\n",
" # install setuptools\n",
" !pip install setuptools==61 -qqq\n",
" # %pip install setuptools==61 -qqq\n",
" # install ammico\n",
" !pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" %pip install git+https://github.com/ssciwr/ammico.git -qqq\n",
" # mount google drive for data and API key\n",
" from google.colab import drive\n",
"\n",
@ -38,49 +53,53 @@
"cell_type": "code",
"execution_count": null,
"id": "cf362e60",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import ammico\n",
"from ammico import utils as mutils\n",
"from ammico import display as mdisplay"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27675810",
"metadata": {
"tags": []
},
"outputs": [],
"attachments": {},
"cell_type": "markdown",
"id": "fddba721",
"metadata": {},
"source": [
"# download the models if they are not there yet\n",
"!python -m spacy download en_core_web_md\n",
"!python -m textblob.download_corpora"
"We select a subset of image files to try the text extraction on, see the `limit` keyword. The `find_files` function finds image files within a given directory: "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6da3a7aa",
"metadata": {
"tags": []
},
"id": "27675810",
"metadata": {},
"outputs": [],
"source": [
"images = mutils.find_files(path=\"data\", limit=10)"
"# Here you need to provide the path to your google drive folder\n",
"# or local folder containing the images\n",
"images = mutils.find_files(\n",
" path=\"data/\",\n",
" limit=10,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3a7dfe11",
"metadata": {},
"source": [
"We need to initialize the main dictionary that contains all information for the images and is updated through each subsequent analysis:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b32409f",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"mydict = mutils.initialize_dict(images)"
@ -91,45 +110,61 @@
"id": "7b8b929f",
"metadata": {},
"source": [
"## google cloud vision API\n",
"First 1000 images per month are free."
"## Google cloud vision API\n",
"\n",
"For this you need an API key and have the app activated in your google console. The first 1000 images per month are free (July 2022)."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cbf74c0b-52fe-4fb8-b617-f18611e8f986",
"metadata": {},
"source": [
"```\n",
"os.environ[\n",
" \"GOOGLE_APPLICATION_CREDENTIALS\"\n",
"] = \"your-credentials.json\"\n",
"```"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0891b795-c7fe-454c-a45d-45fadf788142",
"metadata": {},
"source": [
"## Inspect the elements per image"
"## Inspect the elements per image\n",
"To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this and directly export a csv file in the step below.\n",
"Here, we display the text extraction and translation results provided by the above libraries. Click on the tabs to see the results in the right sidebar. You may need to increment the `port` number if you are already running several notebook instances on the same server."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6ecc88",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"mdisplay.explore_analysis(mydict, identify=\"text-on-image\")"
"analysis_explorer = mdisplay.AnalysisExplorer(mydict, identify=\"text-on-image\")\n",
"analysis_explorer.run_server(port=8054)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9c3e72b5-0e57-4019-b45e-3e36a74e7f52",
"metadata": {},
"source": [
"## Or directly analyze for further processing"
"## Or directly analyze for further processing\n",
"Instead of inspecting each of the images, you can also directly carry out the analysis and export the result into a csv. This may take a while depending on how many images you have loaded. Set the keyword `analyse_text` to `True` if you want the text to be analyzed (spell check, subjectivity, text summary, sentiment, NER)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365c78b1-7ff4-4213-86fa-6a0a2d05198f",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"for key in mydict:\n",
@ -139,56 +174,211 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3c063eda",
"metadata": {},
"source": [
"## Convert to dataframe and write csv"
"## Convert to dataframe and write csv\n",
"These steps are required to convert the dictionary of dictionarys into a dictionary with lists, that can be converted into a pandas dataframe and exported to a csv file."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5709c2cd",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"outdict = mutils.append_data_to_dict(mydict)\n",
"df = mutils.dump_df(outdict)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ae182eb7",
"metadata": {},
"source": [
"Check the dataframe:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4f05637",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# check the dataframe\n",
"df.head(10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "eedf1e47",
"metadata": {},
"source": [
"Write the csv file - here you should provide a file path and file name for the csv file to be written."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf6c9ddb",
"metadata": {
"tags": []
},
"metadata": {},
"outputs": [],
"source": [
"# Write the csv\n",
"df.to_csv(\"./data_out.csv\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "4bc8ac0a",
"metadata": {},
"source": [
"## Topic analysis\n",
"The topic analysis is carried out using [BERTopic](https://maartengr.github.io/BERTopic/index.html) using an embedded model through a [spaCy](https://spacy.io/) pipeline."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "4931941b",
"metadata": {},
"source": [
"BERTopic takes a list of strings as input. The more items in the list, the better for the topic modeling. If the below returns an error for `analyse_topic()`, the reason can be that your dataset is too small.\n",
"\n",
"You can pass which dataframe entry you would like to have analyzed. The default is `text_english`, but you could for example also select `text_summary` or `text_english_correct` setting the keyword `analyze_text` as so:\n",
"\n",
"`ammico.text.PostprocessText(mydict=mydict, analyze_text=\"text_summary\").analyse_topic()`\n",
"\n",
"### Option 1: Use the dictionary as obtained from the above analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9012544e-f818-46ea-b087-3e150850a5d5",
"id": "a3450a61",
"metadata": {},
"outputs": [],
"source": [
"# make a list of all the text_english entries per analysed image from the mydict variable as above\n",
"topic_model, topic_df, most_frequent_topics = ammico.text.PostprocessText(\n",
" mydict=mydict\n",
").analyse_topic()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "95667342",
"metadata": {},
"source": [
"### Option 2: Read in a csv\n",
"Not to analyse too many images on google Cloud Vision, use the csv output to obtain the text (when rerunning already analysed images)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5530e436",
"metadata": {},
"outputs": [],
"source": [
"input_file_path = \"data_out.csv\"\n",
"topic_model, topic_df, most_frequent_topics = ammico.text.PostprocessText(\n",
" use_csv=True, csv_path=input_file_path\n",
").analyse_topic(return_topics=10)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0b6ef6d7",
"metadata": {},
"source": [
"### Access frequent topics\n",
"A topic of `-1` stands for an outlier and should be ignored. Topic count is the number of occurence of that topic. The output is structured from most frequent to least frequent topic."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43288cda-61bb-4ff1-a209-dcfcc4916b1f",
"metadata": {},
"outputs": [],
"source": [
"print(topic_df)"
]
},
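{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Since topic `-1` collects the outliers, you may want to drop it before further processing. A minimal sketch, assuming `topic_df` follows the usual BERTopic topic-info layout with a `Topic` column:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# drop the outlier topic (-1); assumes a \"Topic\" column as in BERTopic's topic info\n",
"topic_df_no_outliers = topic_df[topic_df[\"Topic\"] != -1]\n",
"print(topic_df_no_outliers)"
]
},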
{
"attachments": {},
"cell_type": "markdown",
"id": "b3316770",
"metadata": {},
"source": [
"### Get information for specific topic\n",
"The most frequent topics can be accessed through `most_frequent_topics` with the most occuring topics first in the list."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db14fe03",
"metadata": {},
"outputs": [],
"source": [
"for topic in most_frequent_topics:\n",
" print(\"Topic:\", topic)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d10f701e",
"metadata": {},
"source": [
"### Topic visualization\n",
"The topics can also be visualized. Careful: This only works if there is sufficient data (quantity and quality)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2331afe6",
"metadata": {},
"outputs": [],
"source": [
"topic_model.visualize_topics()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f4eaf353",
"metadata": {},
"source": [
"### Save the model\n",
"The model can be saved for future use."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e8377c",
"metadata": {},
"outputs": [],
"source": [
"topic_model.save(\"misinfo_posts\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c94edb9",
"metadata": {},
"outputs": [],
"source": []
@ -196,7 +386,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},

128
notebooks/multimodal_search.ipynb generated
View file

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "22df2297-0629-45aa-b88c-6c61f1544db6",
"metadata": {},
@ -9,6 +10,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9eeeb302-296e-48dc-86c7-254aa02f2b3a",
"metadata": {},
@ -71,6 +73,16 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a08bd3a9-e954-4a0e-ad64-6817abd3a25a",
"metadata": {},
"outputs": [],
"source": [
"images"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -84,6 +96,17 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c091f95-07cf-42c3-82c8-5f3a3c5929f8",
"metadata": {},
"outputs": [],
"source": [
"mydict"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "987540a8-d800-4c70-a76b-7bfabaf123fa",
"metadata": {},
@ -92,6 +115,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "66d6ede4-00bc-4aeb-9a36-e52d7de33fe5",
"metadata": {},
@ -123,6 +147,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "357828c9",
"metadata": {},
@ -130,6 +155,26 @@
"To process the loaded images using the selected model, use the below code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6f2c9b1-4a91-47cb-86b5-2c9c67e4837b",
"metadata": {},
"outputs": [],
"source": [
"my_obj = ms.MultimodalSearch(mydict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16603ded-078e-4362-847b-57ad76829327",
"metadata": {},
"outputs": [],
"source": [
"my_obj.subdict"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -146,19 +191,29 @@
" image_keys,\n",
" image_names,\n",
" features_image_stacked,\n",
") = ms.MultimodalSearch.parsing_images(\n",
" mydict, \n",
") = my_obj.parsing_images(\n",
" model_type, \n",
" path_to_saved_tensors=\"/content/drive/MyDrive/misinformation-data/\"\n",
" path_to_save_tensors=\"/content/drive/MyDrive/misinformation-data/\",\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f236c3b1-c3a6-471a-9fc5-ef831b675286",
"metadata": {},
"outputs": [],
"source": [
"features_image_stacked"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9ff8a894-566b-4c4f-acca-21c50b5b1f52",
"metadata": {},
"source": [
"The images are then processed and stored in a numerical representation, a tensor. These tensors do not change for the same image and same model - so if you run this analysis once, and save the tensors giving a path with the keyword `path_to_saved_tensors`, a file with filename `.<Number_of_images>_<model_name>_saved_features_image.pt` will be placed there.\n",
"The images are then processed and stored in a numerical representation, a tensor. These tensors do not change for the same image and same model - so if you run this analysis once, and save the tensors giving a path with the keyword `path_to_save_tensors`, a file with filename `.<Number_of_images>_<model_name>_saved_features_image.pt` will be placed there.\n",
"\n",
"This will save you a lot of time if you want to analyse same images with the same model but different questions. To run using the saved tensors, execute the below code giving the path and name of the tensor file."
]
@ -179,14 +234,14 @@
"# image_keys,\n",
"# image_names,\n",
"# features_image_stacked,\n",
"# ) = ms.MultimodalSearch.parsing_images(\n",
"# mydict,\n",
"# ) = my_obj.parsing_images(\n",
"# model_type,\n",
"# path_to_load_tensors=\".5_blip_saved_features_image.pt\",\n",
"# path_to_load_tensors=\"/content/drive/MyDrive/misinformation-data/5_clip_base_saved_features_image.pt\",\n",
"# )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "309923c1-d6f8-4424-8fca-bde5f3a98b38",
"metadata": {},
@ -195,6 +250,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "162a52e8-6652-4897-b92e-645cab07aaef",
"metadata": {},
@ -221,6 +277,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8bcf3127-3dfd-4ff4-b9e7-a043099b1418",
"metadata": {},
@ -240,8 +297,7 @@
},
"outputs": [],
"source": [
"similarity, sorted_lists = ms.MultimodalSearch.multimodal_search(\n",
" mydict,\n",
"similarity, sorted_lists = my_obj.multimodal_search(\n",
" model,\n",
" vis_processors,\n",
" txt_processors,\n",
@ -254,6 +310,37 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65210ca2-b674-44bd-807a-4165e14bad74",
"metadata": {},
"outputs": [],
"source": [
"similarity"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "557473df-e2b9-4ef0-9439-3daadf6741ac",
"metadata": {},
"outputs": [],
"source": [
"sorted_lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c93d7e88-594d-4095-b5f2-7bf01210dc61",
"metadata": {},
"outputs": [],
"source": [
"mydict"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e1cf7e46-0c2c-4fb2-b89a-ef585ccb9339",
"metadata": {},
@ -274,6 +361,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cd3ee120-8561-482b-a76a-e8f996783325",
"metadata": {},
@ -290,13 +378,13 @@
},
"outputs": [],
"source": [
"ms.MultimodalSearch.show_results(\n",
" mydict,\n",
"my_obj.show_results(\n",
" search_query3[0],\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "0b750e9f-fe64-4028-9caf-52d7187462f1",
"metadata": {},
@ -329,8 +417,7 @@
},
"outputs": [],
"source": [
"itm_scores, image_gradcam_with_itm = ms.MultimodalSearch.image_text_match_reordering(\n",
" mydict,\n",
"itm_scores, image_gradcam_with_itm = my_obj.image_text_match_reordering(\n",
" search_query3,\n",
" itm_model,\n",
" image_keys,\n",
@ -341,6 +428,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9e98c150-5fab-4251-bce7-0d8fc7b385b9",
"metadata": {},
@ -357,12 +445,13 @@
},
"outputs": [],
"source": [
"ms.MultimodalSearch.show_results(\n",
" mydict, search_query3[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm\n",
"my_obj.show_results(\n",
" search_query3[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d86ab96b-1907-4b7f-a78e-3983b516d781",
"metadata": {
@ -373,6 +462,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "4bdbc4d4-695d-4751-ab7c-d2d98e2917d7",
"metadata": {
@ -391,11 +481,12 @@
},
"outputs": [],
"source": [
"outdict = ammico.utils.append_data_to_dict(mydict)\n",
"df = ammico.utils.dump_df(outdict)"
"outdict = mutils.append_data_to_dict(mydict)\n",
"df = mutils.dump_df(outdict)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ea2675d5-604c-45e7-86d2-080b1f4559a0",
"metadata": {
@ -418,6 +509,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "05546d99-afab-4565-8f30-f14e1426abcf",
"metadata": {},
@ -448,7 +540,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},