Merge 3e036514cd3c05e70bd74eddeabe5bdb09c38890 into 3f9e855aebddf6eddaa81de1cd883bc5bcf5d3bc

This commit is contained in:
Dmitrii Kapitan 2025-10-22 15:24:47 +00:00 committed by GitHub
parents 3f9e855aeb 3e036514cd
commit 2ff2dffc59
No key matching this signature was found
GPG key ID: B5690EEEBB952194
18 changed files with 1726 additions and 99 deletions

2
.github/workflows/ci.yml vendored

@@ -31,7 +31,7 @@ jobs:
- name: Run pytest
run: |
cd ammico
python -m pytest -svv -m "not gcv" --cov=. --cov-report=xml
python -m pytest -svv -m "not gcv and not long" --cov=. --cov-report=xml
- name: Upload coverage
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11'
uses: codecov/codecov-action@v3


@@ -1,7 +1,10 @@
from ammico.display import AnalysisExplorer
from ammico.faces import EmotionDetector, ethical_disclosure
from ammico.model import MultimodalSummaryModel
from ammico.text import TextDetector, TextAnalyzer, privacy_disclosure
from ammico.utils import find_files, get_dataframe
from ammico.image_summary import ImageSummaryDetector
from ammico.utils import find_files, get_dataframe, AnalysisType, find_videos
from ammico.video_summary import VideoSummaryDetector
# Export the version defined in project metadata
try:
@@ -12,11 +15,16 @@ except ImportError:
__version__ = "unknown"
__all__ = [
"AnalysisType",
"AnalysisExplorer",
"EmotionDetector",
"MultimodalSummaryModel",
"TextDetector",
"TextAnalyzer",
"ImageSummaryDetector",
"VideoSummaryDetector",
"find_files",
"find_videos",
"get_dataframe",
"ethical_disclosure",
"privacy_disclosure",


@@ -1,10 +1,14 @@
import ammico.faces as faces
import ammico.text as text
import ammico.colors as colors
import ammico.image_summary as image_summary
from ammico.model import MultimodalSummaryModel
import pandas as pd
from dash import html, Input, Output, dcc, State, Dash
from PIL import Image
import dash_bootstrap_components as dbc
import warnings
from typing import Dict, Any, List, Optional
COLOR_SCHEMES = [
@@ -94,6 +98,8 @@ class AnalysisExplorer:
State("left_select_id", "options"),
State("left_select_id", "value"),
State("Dropdown_select_Detector", "value"),
State("Dropdown_analysis_type", "value"),
State("textarea_questions", "value"),
State("setting_Text_analyse_text", "value"),
State("setting_privacy_env_var", "value"),
State("setting_Emotion_emotion_threshold", "value"),
@@ -108,9 +114,15 @@ class AnalysisExplorer:
Output("settings_TextDetector", "style"),
Output("settings_EmotionDetector", "style"),
Output("settings_ColorDetector", "style"),
Output("settings_VQA", "style"),
Input("Dropdown_select_Detector", "value"),
)(self._update_detector_setting)
self.app.callback(
Output("textarea_questions", "style"),
Input("Dropdown_analysis_type", "value"),
)(self._show_questions_textarea_on_demand)
# I split the different sections into subfunctions for better clarity
def _top_file_explorer(self, mydict: dict) -> html.Div:
"""Initialize the file explorer dropdown for selecting the file to be analyzed.
@@ -157,14 +169,6 @@ class AnalysisExplorer:
id="settings_TextDetector",
style={"display": "none"},
children=[
dbc.Row(
dcc.Checklist(
["Analyse text"],
["Analyse text"],
id="setting_Text_analyse_text",
style={"margin-bottom": "10px"},
),
),
# row 1
dbc.Row(
dbc.Col(
@@ -272,8 +276,69 @@ class AnalysisExplorer:
)
],
),
# start VQA settings
html.Div(
id="settings_VQA",
style={"display": "none"},
children=[
dbc.Card(
[
dbc.CardBody(
[
dbc.Row(
dbc.Col(
dcc.Dropdown(
id="Dropdown_analysis_type",
options=[
{"label": v, "value": v}
for v in SUMMARY_ANALYSIS_TYPE
],
value="summary_and_questions",
clearable=False,
style={
"width": "100%",
"minWidth": "240px",
"maxWidth": "520px",
},
),
),
justify="start",
),
html.Div(style={"height": "8px"}),
dbc.Row(
[
dbc.Col(
dcc.Textarea(
id="textarea_questions",
value="Are there people in the image?\nWhat is this picture about?",
placeholder="One question per line...",
style={
"width": "100%",
"minHeight": "160px",
"height": "220px",
"resize": "vertical",
"overflow": "auto",
},
rows=8,
),
width=12,
),
],
justify="start",
),
]
)
],
style={
"width": "100%",
"marginTop": "10px",
"zIndex": 2000,
},
)
],
),
],
style={"width": "100%", "display": "inline-block"},
style={"width": "100%", "display": "inline-block", "overflow": "visible"},
)
return settings_layout
@@ -293,6 +358,7 @@ class AnalysisExplorer:
"TextDetector",
"EmotionDetector",
"ColorDetector",
"VQA",
],
value="TextDetector",
id="Dropdown_select_Detector",
@@ -344,7 +410,7 @@ class AnalysisExplorer:
port (int, optional): The port number to run the server on (default: 8050).
"""
self.app.run_server(debug=True, port=port)
self.app.run(debug=True, port=port)
# Dash callbacks
def update_picture(self, img_path: str):
@@ -379,19 +445,27 @@ class AnalysisExplorer:
if setting_input == "EmotionDetector":
return display_none, display_flex, display_none, display_none
if setting_input == "ColorDetector":
return display_none, display_none, display_flex, display_none
if setting_input == "VQA":
return display_none, display_none, display_none, display_flex
else:
return display_none, display_none, display_none, display_none
def _parse_questions(self, text: Optional[str]) -> Optional[List[str]]:
if not text:
return None
qs = [q.strip() for q in text.splitlines() if q.strip()]
return qs if qs else None
def _right_output_analysis(
self,
n_clicks,
all_img_options: dict,
current_img_value: str,
detector_value: str,
analysis_type_value: str,
textarea_questions_value: str,
settings_text_analyse_text: list,
setting_privacy_env_var: str,
setting_emotion_emotion_threshold: int,
@@ -413,54 +487,75 @@ class AnalysisExplorer:
"EmotionDetector": faces.EmotionDetector,
"TextDetector": text.TextDetector,
"ColorDetector": colors.ColorDetector,
"VQA": image_summary.ImageSummaryDetector,
}
# Get image ID from dropdown value, which is the filepath
if current_img_value is None:
return {}
image_id = all_img_options[current_img_value]
# copy image so previous runs don't leave their default values in the dict
image_copy = self.mydict[image_id].copy()
image_copy = self.mydict.get(image_id, {}).copy()
# detector value is the string name of the chosen detector
identify_function = identify_dict[detector_value]
if detector_value == "TextDetector":
analyse_text = (
True if settings_text_analyse_text == ["Analyse text"] else False
)
detector_class = identify_function(
image_copy,
analyse_text=analyse_text,
accept_privacy=(
setting_privacy_env_var
if setting_privacy_env_var
else "PRIVACY_AMMICO"
),
)
elif detector_value == "EmotionDetector":
detector_class = identify_function(
image_copy,
emotion_threshold=setting_emotion_emotion_threshold,
race_threshold=setting_emotion_race_threshold,
gender_threshold=setting_emotion_gender_threshold,
accept_disclosure=(
setting_emotion_env_var
if setting_emotion_env_var
else "DISCLOSURE_AMMICO"
),
)
elif detector_value == "ColorDetector":
detector_class = identify_function(
image_copy,
delta_e_method=setting_color_delta_e_method,
)
analysis_dict: Dict[str, Any] = {}
if detector_value == "VQA":
try:
qwen_model = MultimodalSummaryModel(
model_id="Qwen/Qwen2.5-VL-3B-Instruct"
) # TODO: allow user to specify model
vqa_cls = identify_dict.get("VQA")
vqa_detector = vqa_cls(qwen_model, subdict={})
questions_list = self._parse_questions(textarea_questions_value)
analysis_result = vqa_detector.analyse_image(
image_copy,
analysis_type=analysis_type_value,
list_of_questions=questions_list,
is_concise_summary=True,
is_concise_answer=True,
)
analysis_dict = analysis_result or {}
except Exception as e:
warnings.warn(f"VQA/Image tasks failed: {e}")
analysis_dict = {"image_tasks_error": str(e)}
else:
detector_class = identify_function(image_copy)
analysis_dict = detector_class.analyse_image()
# detector value is the string name of the chosen detector
identify_function = identify_dict[detector_value]
# Initialize an empty dictionary
new_analysis_dict = {}
if detector_value == "TextDetector":
analyse_text = (
True if settings_text_analyse_text == ["Analyse text"] else False
)
detector_class = identify_function(
image_copy,
analyse_text=analyse_text,
accept_privacy=(
setting_privacy_env_var
if setting_privacy_env_var
else "PRIVACY_AMMICO"
),
)
elif detector_value == "EmotionDetector":
detector_class = identify_function(
image_copy,
emotion_threshold=setting_emotion_emotion_threshold,
race_threshold=setting_emotion_race_threshold,
gender_threshold=setting_emotion_gender_threshold,
accept_disclosure=(
setting_emotion_env_var
if setting_emotion_env_var
else "DISCLOSURE_AMMICO"
),
)
elif detector_value == "ColorDetector":
detector_class = identify_function(
image_copy,
delta_e_method=setting_color_delta_e_method,
)
else:
detector_class = identify_function(image_copy)
analysis_dict = detector_class.analyse_image()
new_analysis_dict: Dict[str, Any] = {}
# Iterate over the items in the original dictionary
for k, v in analysis_dict.items():
@@ -480,3 +575,9 @@ class AnalysisExplorer:
return dbc.Table.from_dataframe(
df, striped=True, bordered=True, hover=True, index=True
)
def _show_questions_textarea_on_demand(self, analysis_type_value: str) -> dict:
if analysis_type_value in ("questions", "summary_and_questions"):
return {"display": "block", "width": "100%"}
else:
return {"display": "none"}

365
ammico/image_summary.py normal file

@@ -0,0 +1,365 @@
from ammico.utils import AnalysisMethod, AnalysisType
from ammico.model import MultimodalSummaryModel
import os
import torch
from PIL import Image
import warnings
from typing import List, Optional, Union, Dict, Any
from collections.abc import Sequence as _Sequence
from transformers import GenerationConfig
from qwen_vl_utils import process_vision_info
class ImageSummaryDetector(AnalysisMethod):
def __init__(
self,
summary_model: MultimodalSummaryModel,
subdict: Optional[Dict[str, Any]] = None,
) -> None:
"""
Class for analysing images using the QWEN-2.5-VL model.
It provides methods for generating captions and answering questions about images.
Args:
summary_model (MultimodalSummaryModel): An instance of MultimodalSummaryModel to be used for analysis.
subdict (dict, optional): Dictionary containing the images to be analysed. Defaults to {}.
Returns:
None.
"""
if subdict is None:
subdict = {}
super().__init__(subdict)
self.summary_model = summary_model
def _load_pil_if_needed(
self, filename: Union[str, os.PathLike, Image.Image]
) -> Image.Image:
if isinstance(filename, (str, os.PathLike)):
return Image.open(filename).convert("RGB")
elif isinstance(filename, Image.Image):
return filename.convert("RGB")
else:
raise ValueError("filename must be a path or PIL.Image")
@staticmethod
def _is_sequence_but_not_str(obj: Any) -> bool:
"""True for sequence-like but not a string/bytes/PIL.Image."""
return isinstance(obj, _Sequence) and not isinstance(
obj, (str, bytes, Image.Image)
)
def _prepare_inputs(
self, list_of_questions: list[str], entry: Optional[Dict[str, Any]] = None
) -> Dict[str, torch.Tensor]:
filename = entry.get("filename")
if filename is None:
raise ValueError("entry must contain key 'filename'")
if isinstance(filename, (str, os.PathLike, Image.Image)):
images_context = self._load_pil_if_needed(filename)
elif self._is_sequence_but_not_str(filename):
images_context = [self._load_pil_if_needed(i) for i in filename]
else:
raise ValueError(
"Unsupported 'filename' entry: expected path, PIL.Image, or sequence."
)
images_only_messages = [
{
"role": "user",
"content": [
*(
[{"type": "image", "image": img} for img in images_context]
if isinstance(images_context, list)
else [{"type": "image", "image": images_context}]
)
],
}
]
try:
image_inputs, _ = process_vision_info(images_only_messages)
except Exception as e:
raise RuntimeError(f"Image processing failed: {e}")
texts: List[str] = []
for q in list_of_questions:
messages = [
{
"role": "user",
"content": [
*(
[
{"type": "image", "image": image}
for image in images_context
]
if isinstance(images_context, list)
else [{"type": "image", "image": images_context}]
),
{"type": "text", "text": q},
],
}
]
text = self.summary_model.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
texts.append(text)
images_batch = [image_inputs] * len(texts)
inputs = self.summary_model.processor(
text=texts,
images=images_batch,
padding=True,
return_tensors="pt",
)
inputs = {k: v.to(self.summary_model.device) for k, v in inputs.items()}
return inputs
def analyse_image(
self,
entry: dict,
analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY,
list_of_questions: Optional[List[str]] = None,
is_concise_summary: bool = True,
is_concise_answer: bool = True,
) -> Dict[str, Any]:
"""
Analyse a single image entry. Returns dict with keys depending on analysis_type:
- 'caption' (str) if summary requested
- 'vqa' (list[str]) if questions requested
"""
self.subdict = entry
analysis_type, is_summary, is_questions = AnalysisType._validate_analysis_type(
analysis_type, list_of_questions
)
if is_summary:
try:
caps = self.generate_caption(
entry,
num_return_sequences=1,
is_concise_summary=is_concise_summary,
)
self.subdict["caption"] = caps[0] if caps else ""
except Exception as e:
warnings.warn(f"Caption generation failed: {e}")
if is_questions:
try:
vqa_map = self.answer_questions(
list_of_questions, entry, is_concise_answer
)
self.subdict["vqa"] = vqa_map
except Exception as e:
warnings.warn(f"VQA failed: {e}")
return self.subdict
def analyse_images_from_dict(
self,
analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY,
list_of_questions: Optional[List[str]] = None,
keys_batch_size: int = 16,
is_concise_summary: bool = True,
is_concise_answer: bool = True,
) -> Dict[str, dict]:
"""
Analyse all images in self.subdict with the model.
Args:
analysis_type (str): type of the analysis.
list_of_questions (list[str]): list of questions.
keys_batch_size (int): number of images to process in a batch.
is_concise_summary (bool): whether to generate concise summary.
is_concise_answer (bool): whether to generate concise answers.
Returns:
self.subdict (dict): dictionary with analysis results.
"""
# TODO: add option to ask multiple questions per image as one batch.
analysis_type, is_summary, is_questions = AnalysisType._validate_analysis_type(
analysis_type, list_of_questions
)
keys = list(self.subdict.keys())
for batch_start in range(0, len(keys), keys_batch_size):
batch_keys = keys[batch_start : batch_start + keys_batch_size]
for key in batch_keys:
entry = self.subdict[key]
if is_summary:
try:
caps = self.generate_caption(
entry,
num_return_sequences=1,
is_concise_summary=is_concise_summary,
)
entry["caption"] = caps[0] if caps else ""
except Exception as e:
warnings.warn(f"Caption generation failed: {e}")
if is_questions:
try:
vqa_map = self.answer_questions(
list_of_questions, entry, is_concise_answer
)
entry["vqa"] = vqa_map
except Exception as e:
warnings.warn(f"VQA failed: {e}")
self.subdict[key] = entry
return self.subdict
def generate_caption(
self,
entry: Optional[Dict[str, Any]] = None,
num_return_sequences: int = 1,
is_concise_summary: bool = True,
) -> List[str]:
"""
Create caption for image. Depending on is_concise_summary it will be either concise or detailed.
Args:
entry (dict): dictionary containing the image to be captioned.
num_return_sequences (int): number of captions to generate.
is_concise_summary (bool): whether to generate concise summary.
Returns:
results (list[str]): list of generated captions.
"""
if is_concise_summary:
prompt = ["Describe this image in one concise caption."]
max_new_tokens = 64
else:
prompt = ["Describe this image."]
max_new_tokens = 256
inputs = self._prepare_inputs(prompt, entry)
gen_conf = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=False,
num_return_sequences=num_return_sequences,
)
with torch.inference_mode():
try:
if self.summary_model.device == "cuda":
with torch.amp.autocast("cuda", enabled=True):
generated_ids = self.summary_model.model.generate(
**inputs, generation_config=gen_conf
)
else:
generated_ids = self.summary_model.model.generate(
**inputs, generation_config=gen_conf
)
except RuntimeError as e:
warnings.warn(
f"Retry without autocast failed: {e}. Attempting cudnn-disabled retry."
)
cudnn_was_enabled = (
torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled
)
if cudnn_was_enabled:
torch.backends.cudnn.enabled = False
try:
generated_ids = self.summary_model.model.generate(
**inputs, generation_config=gen_conf
)
except Exception as retry_error:
raise RuntimeError(
f"Failed to generate ids after retry: {retry_error}"
) from retry_error
finally:
if cudnn_was_enabled:
torch.backends.cudnn.enabled = True
decoded = None
if "input_ids" in inputs:
in_ids = inputs["input_ids"]
trimmed = [
out_ids[len(inp_ids) :]
for inp_ids, out_ids in zip(in_ids, generated_ids)
]
decoded = self.summary_model.tokenizer.batch_decode(
trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
else:
decoded = self.summary_model.tokenizer.batch_decode(
generated_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
results = [d.strip() for d in decoded]
return results
def answer_questions(
self,
list_of_questions: list[str],
entry: Optional[Dict[str, Any]] = None,
is_concise_answer: bool = True,
) -> List[str]:
"""
Create answers for list of questions about image.
Args:
list_of_questions (list[str]): list of questions.
entry (dict): dictionary containing the image to be captioned.
is_concise_answer (bool): whether to generate concise answers.
Returns:
answers (list[str]): list of answers.
"""
if is_concise_answer:
gen_conf = GenerationConfig(max_new_tokens=64, do_sample=False)
for i in range(len(list_of_questions)):
if not list_of_questions[i].strip().endswith("?"):
list_of_questions[i] = list_of_questions[i].strip() + "?"
if not list_of_questions[i].lower().startswith("answer concisely"):
list_of_questions[i] = "Answer concisely: " + list_of_questions[i]
else:
gen_conf = GenerationConfig(max_new_tokens=128, do_sample=False)
question_chunk_size = 8
answers: List[str] = []
n = len(list_of_questions)
for i in range(0, n, question_chunk_size):
chunk = list_of_questions[i : i + question_chunk_size]
inputs = self._prepare_inputs(chunk, entry)
with torch.inference_mode():
if self.summary_model.device == "cuda":
with torch.amp.autocast("cuda", enabled=True):
out_ids = self.summary_model.model.generate(
**inputs, generation_config=gen_conf
)
else:
out_ids = self.summary_model.model.generate(
**inputs, generation_config=gen_conf
)
if "input_ids" in inputs:
in_ids = inputs["input_ids"]
trimmed_batch = [
out_row[len(inp_row) :] for inp_row, out_row in zip(in_ids, out_ids)
]
decoded = self.summary_model.tokenizer.batch_decode(
trimmed_batch,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
else:
decoded = self.summary_model.tokenizer.batch_decode(
out_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
answers.extend([d.strip() for d in decoded])
if len(answers) != len(list_of_questions):
raise ValueError(
f"Expected {len(list_of_questions)} answers, but got {len(answers)}, try vary amount of questions"
)
return answers
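For orientation, a minimal end-to-end sketch of how this detector is driven from user code, mirroring the notebook and tests added in this PR (the image folder path is a placeholder):

import ammico

model = ammico.MultimodalSummaryModel()  # larger checkpoint on CUDA, smaller one on CPU
image_dict = ammico.find_files(path="/insert/your/path/here/", limit=-1)
detector = ammico.ImageSummaryDetector(summary_model=model, subdict=image_dict)

# captions for every image in the dictionary
results = detector.analyse_images_from_dict(analysis_type="summary", is_concise_summary=True)

# visual question answering for the same images
results = detector.analyse_images_from_dict(
    analysis_type="questions",
    list_of_questions=["Are there people in the image?", "What is this picture about?"],
    is_concise_answer=True,
)

model.close()  # free model resources when done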

126
ammico/model.py normal file

@@ -0,0 +1,126 @@
import torch
import warnings
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
BitsAndBytesConfig,
AutoTokenizer,
)
from typing import Optional
class MultimodalSummaryModel:
DEFAULT_CUDA_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
DEFAULT_CPU_MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
def __init__(
self,
model_id: Optional[str] = None,
device: Optional[str] = None,
cache_dir: Optional[str] = None,
) -> None:
"""
Class for QWEN-2.5-VL model loading and inference.
Args:
model_id: Model identifier to load; defaults to a smaller checkpoint when device is "cpu".
device: "cuda" or "cpu" (auto-detected when None).
cache_dir: huggingface cache dir (optional).
"""
self.device = self._resolve_device(device)
if model_id is not None and model_id not in (
self.DEFAULT_CUDA_MODEL,
self.DEFAULT_CPU_MODEL,
):
raise ValueError(
f"model_id must be one of {self.DEFAULT_CUDA_MODEL} or {self.DEFAULT_CPU_MODEL}"
)
self.model_id = model_id or (
self.DEFAULT_CUDA_MODEL if self.device == "cuda" else self.DEFAULT_CPU_MODEL
)
self.cache_dir = cache_dir
self._trust_remote_code = True
self._quantize = True
self.model = None
self.processor = None
self.tokenizer = None
self._load_model_and_processor()
@staticmethod
def _resolve_device(device: Optional[str]) -> str:
if device is None:
return "cuda" if torch.cuda.is_available() else "cpu"
if device.lower() not in ("cuda", "cpu"):
raise ValueError("device must be 'cuda' or 'cpu'")
if device.lower() == "cuda" and not torch.cuda.is_available():
warnings.warn(
"Although 'cuda' was requested, no CUDA device is available. Using CPU instead.",
RuntimeWarning,
stacklevel=2,
)
return "cpu"
return device.lower()
def _load_model_and_processor(self):
load_kwargs = {"trust_remote_code": self._trust_remote_code, "use_cache": True}
if self.cache_dir:
load_kwargs["cache_dir"] = self.cache_dir
self.processor = AutoProcessor.from_pretrained(
self.model_id, padding_side="left", **load_kwargs
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, **load_kwargs)
if self.device == "cuda":
compute_dtype = (
torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=compute_dtype,
)
load_kwargs["quantization_config"] = bnb_config
load_kwargs["device_map"] = "auto"
else:
load_kwargs.pop("quantization_config", None)
load_kwargs.pop("device_map", None)
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
self.model_id, **load_kwargs
)
self.model.eval()
def _close(self) -> None:
"""Free model resources (helpful in long-running processes)."""
try:
if self.model is not None:
del self.model
self.model = None
if self.processor is not None:
del self.processor
self.processor = None
if self.tokenizer is not None:
del self.tokenizer
self.tokenizer = None
finally:
try:
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception as e:
warnings.warn(
"Failed to empty CUDA cache. This is not critical, but may lead to memory lingering: "
f"{e!r}",
RuntimeWarning,
stacklevel=2,
)
def close(self) -> None:
"""Free model resources (helpful in long-running processes)."""
self._close()
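A small sketch of the model lifecycle as exercised by the tests in this PR (explicit CPU device and the small checkpoint; both values are among the defaults the class accepts):

from ammico.model import MultimodalSummaryModel

m = MultimodalSummaryModel(model_id="Qwen/Qwen2.5-VL-3B-Instruct", device="cpu")
try:
    assert m.model is not None and m.processor is not None and m.tokenizer is not None
    # pass `m` to ImageSummaryDetector / VideoSummaryDetector here
finally:
    m.close()  # deletes model, processor and tokenizer; empties the CUDA cache if available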

190
ammico/notebooks/DemoImageSummaryVQA.ipynb normal file

@@ -0,0 +1,190 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"# Image summary and visual question answering"
]
},
{
"cell_type": "markdown",
"id": "1",
"metadata": {},
"source": [
"This notebook shows how to generate image captions and use the visual question answering with AMMICO. \n",
"\n",
"The first cell imports `ammico`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"import ammico"
]
},
{
"cell_type": "markdown",
"id": "3",
"metadata": {},
"source": [
"The cell below loads the model for VQA tasks. By default, it loads a large model on the GPU (if your device supports CUDA), otherwise it loads a relatively smaller model on the CPU. But you can specify other settings (e.g., a small model on the GPU) if you want."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"model = ammico.MultimodalSummaryModel()"
]
},
{
"cell_type": "markdown",
"id": "5",
"metadata": {},
"source": [
"Here you need to provide the path to your google drive folder or local folder containing the images"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6",
"metadata": {},
"outputs": [],
"source": [
"image_dict = ammico.find_files(\n",
" path=str(\"/insert/your/path/here/\"),\n",
" limit=-1, # -1 means no limit on the number of files, by default it is set to 20\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7",
"metadata": {},
"source": [
"The cell below creates an object that analyzes images and generates a summary using a specific model and image data."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8",
"metadata": {},
"outputs": [],
"source": [
"img = ammico.ImageSummaryDetector(summary_model=model, subdict=image_dict)"
]
},
{
"cell_type": "markdown",
"id": "9",
"metadata": {},
"source": [
"## Image summary "
]
},
{
"cell_type": "markdown",
"id": "10",
"metadata": {},
"source": [
"To start your work with images, you should call the `analyse_images` method.\n",
"\n",
"You can specify what kind of analysis you want to perform with `analysis_type`. `\"summary\"` will generate a summary for all pictures in your dictionary, `\"questions\"` will prepare answers to your questions for all pictures, and `\"summary_and_questions\"` will do both.\n",
"\n",
"Parameter `\"is_concise_summary\"` regulates the length of an answer.\n",
"\n",
"Here we want to get a long summary on each object in our image dictionary."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"metadata": {},
"outputs": [],
"source": [
"summaries = img.analyse_images(analysis_type=\"summary\", is_concise_summary=False)"
]
},
{
"cell_type": "markdown",
"id": "12",
"metadata": {},
"source": [
"## VQA"
]
},
{
"cell_type": "markdown",
"id": "13",
"metadata": {},
"source": [
"In addition to analyzing images in `ammico`, the same model can be used in VQA mode. To do this, you need to define the questions that will be applied to all images from your dict."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14",
"metadata": {},
"outputs": [],
"source": [
"questions = [\"Are there any visible signs of violence?\", \"Is it safe to be there?\"]"
]
},
{
"cell_type": "markdown",
"id": "15",
"metadata": {},
"source": [
"Here is an example of VQA mode usage. You can specify whether you want to receive short answers (recommended option) or not."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16",
"metadata": {},
"outputs": [],
"source": [
"vqa_results = img.analyse_images(\n",
" analysis_type=\"questions\",\n",
" list_of_questions=questions,\n",
" is_concise_answer=True,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ammico-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -104,7 +104,8 @@
"import ammico\n",
"\n",
"# for displaying a progress bar\n",
"from tqdm import tqdm"
"from tqdm import tqdm\n",
"import os"
]
},
{
@@ -140,7 +141,9 @@
"metadata": {},
"outputs": [],
"source": [
"# os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/content/drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\""
"os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = (\n",
" \"/home/inga/projects/misinformation-project/misinformation-notes/misinformation-campaign-981aa55a3b13.json\"\n",
")"
]
},
{
@@ -171,6 +174,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_path = \"./data-test\"\n",
"image_dict = ammico.find_files(\n",
" # path = \"/content/drive/MyDrive/misinformation-data/\",\n",
" path=str(data_path),\n",
@@ -337,7 +341,7 @@
" enumerate(image_dict.keys()), total=len(image_dict)\n",
"): # loop through all images\n",
" image_dict[key] = ammico.TextDetector(\n",
" image_dict[key], analyse_text=True\n",
" image_dict[key]\n",
" ).analyse_image() # analyse image with EmotionDetector and update dict\n",
"\n",
" if (\n",
@@ -361,23 +365,12 @@
"outputs": [],
"source": [
"# initialize the models\n",
"image_summary_detector = ammico.SummaryDetector(\n",
" subdict=image_dict, analysis_type=\"summary\", model_type=\"base\"\n",
"model = ammico.MultimodalSummaryModel()\n",
"image_summary_detector = ammico.ImageSummaryDetector(\n",
" subdict=image_dict, summary_model=model\n",
")\n",
"\n",
"# run the analysis without having to re-iniatialize the model\n",
"for num, key in tqdm(\n",
" enumerate(image_dict.keys()), total=len(image_dict)\n",
"): # loop through all images\n",
" image_dict[key] = image_summary_detector.analyse_image(\n",
" subdict=image_dict[key], analysis_type=\"summary\"\n",
" ) # analyse image with SummaryDetector and update dict\n",
"\n",
" if (\n",
" num % dump_every == 0 | num == len(image_dict) - 1\n",
" ): # save results every dump_every to dump_file\n",
" image_df = ammico.get_dataframe(image_dict)\n",
" image_df.to_csv(dump_file)"
"image_summary_detector.analyse_images(analysis_type=\"summary\")"
]
},
{
@@ -394,6 +387,7 @@
"outputs": [],
"source": [
"# initialize the models\n",
"# currently this does not work because of the way the summary detector is implemented\n",
"image_summary_detector = ammico.SummaryDetector(\n",
" subdict=image_dict, analysis_type=\"summary\", model_type=\"base\"\n",
")\n",

96
ammico/notebooks/DemoVideoSummaryVQA.ipynb normal file

@@ -0,0 +1,96 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Video summary and visual question answering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ammico"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Currently this module supports only video summarization, but it will be updated in the nearest future"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"video_dict = ammico.find_videos(\n",
" path=str(\"/insert/your/path/here/\"), # path to the folder with images\n",
" limit=-1, # -1 means no limit on the number of files, by default it is set to 20\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = ammico.MultimodalSummaryModel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vid_summary_model = ammico.VideoSummaryDetector(summary_model=model, subdict=video_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"summary_dict = vid_summary_model.analyse_video()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"summary_dict[\"summary\"]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ammico-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -1,5 +1,6 @@
import os
import pytest
from ammico.model import MultimodalSummaryModel
@pytest.fixture
@@ -46,3 +47,12 @@ def get_test_my_dict(get_path):
},
}
return test_my_dict
@pytest.fixture(scope="session")
def model():
m = MultimodalSummaryModel(device="cpu")
try:
yield m
finally:
m.close()


@@ -50,6 +50,8 @@ def test_right_output_analysis_emotions(get_AE, get_options, monkeypatch):
get_options[3],
get_options[0],
"EmotionDetector",
"summary",
"Some question",
True,
"SOME_VAR",
50,

37
ammico/test/test_image_summary.py normal file

@@ -0,0 +1,37 @@
from ammico.image_summary import ImageSummaryDetector
import pytest
@pytest.mark.long
def test_image_summary_detector(model, get_testdict):
detector = ImageSummaryDetector(summary_model=model, subdict=get_testdict)
results = detector.analyse_images_from_dict(analysis_type="summary")
assert len(results) == 2
for key in get_testdict.keys():
assert key in results
assert "caption" in results[key]
assert isinstance(results[key]["caption"], str)
assert len(results[key]["caption"]) > 0
@pytest.mark.long
def test_image_summary_detector_questions(model, get_testdict):
list_of_questions = [
"What is happening in the image?",
"How many cars are in the image in total?",
]
detector = ImageSummaryDetector(summary_model=model, subdict=get_testdict)
results = detector.analyse_images_from_dict(
analysis_type="questions", list_of_questions=list_of_questions
)
assert len(results) == 2
for key in get_testdict.keys():
assert "vqa" in results[key]
if key == "IMG_2746":
assert "marathon" in results[key]["vqa"][0].lower()
if key == "IMG_2809":
assert (
"two" in results[key]["vqa"][1].lower() or "2" in results[key]["vqa"][1]
)

30
ammico/test/test_model.py normal file

@@ -0,0 +1,30 @@
import pytest
from ammico.model import MultimodalSummaryModel
@pytest.mark.long
def test_model_init(model):
assert model.model is not None
assert model.processor is not None
assert model.tokenizer is not None
assert model.device is not None
@pytest.mark.long
def test_model_invalid_device():
with pytest.raises(ValueError):
MultimodalSummaryModel(device="invalid_device")
@pytest.mark.long
def test_model_invalid_model_id():
with pytest.raises(ValueError):
MultimodalSummaryModel(model_id="non_existent_model", device="cpu")
@pytest.mark.long
def test_free_resources():
model = MultimodalSummaryModel(device="cpu")
model.close()
assert model.model is None
assert model.processor is None


@@ -52,24 +52,16 @@ def test_privacy_statement(monkeypatch):
def test_TextDetector(set_testdict, accepted):
for item in set_testdict:
test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted)
assert not test_obj.analyse_text
assert not test_obj.skip_extraction
assert test_obj.subdict["filename"] == set_testdict[item]["filename"]
test_obj = tt.TextDetector(
{}, analyse_text=True, skip_extraction=True, accept_privacy=accepted
)
assert test_obj.analyse_text
test_obj = tt.TextDetector({}, skip_extraction=True, accept_privacy=accepted)
assert test_obj.skip_extraction
with pytest.raises(ValueError):
tt.TextDetector({}, analyse_text=1.0, accept_privacy=accepted)
with pytest.raises(ValueError):
tt.TextDetector({}, skip_extraction=1.0, accept_privacy=accepted)
def test_run_spacy(set_testdict, get_path, accepted):
test_obj = tt.TextDetector(
set_testdict["IMG_3755"], analyse_text=True, accept_privacy=accepted
)
test_obj = tt.TextDetector(set_testdict["IMG_3755"], accept_privacy=accepted)
ref_file = get_path + "text_IMG_3755.txt"
with open(ref_file, "r") as file:
reference_text = file.read()
@@ -108,15 +100,11 @@ def test_analyse_image(set_testdict, set_environ, accepted):
for item in set_testdict:
test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted)
test_obj.analyse_image()
test_obj = tt.TextDetector(
set_testdict[item], analyse_text=True, accept_privacy=accepted
)
test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted)
test_obj.analyse_image()
testdict = {}
testdict["text"] = 20000 * "m"
test_obj = tt.TextDetector(
testdict, skip_extraction=True, analyse_text=True, accept_privacy=accepted
)
test_obj = tt.TextDetector(testdict, skip_extraction=True, accept_privacy=accepted)
test_obj.analyse_image()
assert test_obj.subdict["text_truncated"] == 5000 * "m"
assert test_obj.subdict["text"] == 20000 * "m"


@@ -67,7 +67,6 @@ class TextDetector(AnalysisMethod):
def __init__(
self,
subdict: dict,
analyse_text: bool = False,
skip_extraction: bool = False,
accept_privacy: str = "PRIVACY_AMMICO",
) -> None:
@@ -76,8 +75,6 @@ class TextDetector(AnalysisMethod):
Args:
subdict (dict): Dictionary containing file name/path, and possibly previous
analysis results from other modules.
analyse_text (bool, optional): Decide if extracted text will be further subject
to analysis. Defaults to False.
skip_extraction (bool, optional): Decide if text will be extracted from images or
is already provided via a csv. Defaults to False.
accept_privacy (str, optional): Environment variable to accept the privacy
@@ -96,17 +93,13 @@ class TextDetector(AnalysisMethod):
"Privacy disclosure not accepted - skipping text detection."
)
self.translator = Translator(raise_exception=True)
if not isinstance(analyse_text, bool):
raise ValueError("analyse_text needs to be set to true or false")
self.analyse_text = analyse_text
self.skip_extraction = skip_extraction
if not isinstance(skip_extraction, bool):
raise ValueError("skip_extraction needs to be set to true or false")
if self.skip_extraction:
print("Skipping text extraction from image.")
print("Reading text directly from provided dictionary.")
if self.analyse_text:
self._initialize_spacy()
self._initialize_spacy()
def set_keys(self) -> dict:
"""Set the default keys for text analysis.
@@ -183,7 +176,7 @@ class TextDetector(AnalysisMethod):
self._truncate_text()
self.translate_text()
self.remove_linebreaks()
if self.analyse_text and self.subdict["text_english"]:
if self.subdict["text_english"]:
self._run_spacy()
return self.subdict


@@ -5,6 +5,9 @@ import pooch
import importlib_resources
import collections
import random
from enum import Enum
from typing import List, Tuple, Optional, Union
import re
pkg = importlib_resources.files("ammico")
@@ -40,6 +43,41 @@ def ammico_prefetch_models():
res.get()
class AnalysisType(str, Enum):
SUMMARY = "summary"
QUESTIONS = "questions"
SUMMARY_AND_QUESTIONS = "summary_and_questions"
@classmethod
def _validate_analysis_type(
cls,
analysis_type: Union["AnalysisType", str],
list_of_questions: Optional[List[str]],
) -> Tuple[str, bool, bool]:
max_questions_per_image = 15 # safety cap to avoid too many questions
if isinstance(analysis_type, AnalysisType):
analysis_type = analysis_type.value
allowed = {item.value for item in AnalysisType}
if analysis_type not in allowed:
raise ValueError(f"analysis_type must be one of {allowed}")
if analysis_type in ("questions", "summary_and_questions"):
if not list_of_questions:
raise ValueError(
"list_of_questions must be provided for QUESTIONS analysis type."
)
if len(list_of_questions) > max_questions_per_image:
raise ValueError(
f"Number of questions per image ({len(list_of_questions)}) exceeds safety cap ({max_questions_per_image}). Reduce questions or increase max_questions_per_image."
)
is_summary = analysis_type in ("summary", "summary_and_questions")
is_questions = analysis_type in ("questions", "summary_and_questions")
return analysis_type, is_summary, is_questions
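For illustration, a short sketch of how the validator maps the analysis type to the two flags used by the detectors (the helper is internal; this only makes the contract explicit):

from ammico.utils import AnalysisType

AnalysisType._validate_analysis_type("summary", None)
# -> ("summary", True, False)
AnalysisType._validate_analysis_type(AnalysisType.SUMMARY_AND_QUESTIONS, ["What is shown?"])
# -> ("summary_and_questions", True, True)
AnalysisType._validate_analysis_type("questions", None)
# raises ValueError: list_of_questions must be provided for QUESTIONS analysis type.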
class AnalysisMethod:
"""Base class to be inherited by all analysis methods."""
@@ -94,6 +132,113 @@ def _limit_results(results, limit):
return results
def _categorize_outputs(
collected: List[Tuple[float, str]],
include_questions: bool = False,
) -> Tuple[List[str], List[str]]:
"""
Categorize collected outputs into summary bullets and VQA bullets.
Args:
collected (List[Tuple[float, str]]): List of tuples containing timestamps and generated texts.
include_questions (bool): Whether each generated text contains a "VQA Answers" section that should be split out.
Returns:
Tuple[List[str], List[str]]: A tuple containing two lists - summary bullets and VQA bullets.
"""
MAX_CAPTIONS_FOR_SUMMARY = 600  # TODO: constant for now; make it adjustable later, e.g. by dropping near-duplicate frames to reduce the load on the system.
caps_for_summary_vqa = (
collected[-MAX_CAPTIONS_FOR_SUMMARY:]
if len(collected) > MAX_CAPTIONS_FOR_SUMMARY
else collected
)
bullets_summary = []
bullets_vqa = []
for t, c in caps_for_summary_vqa:
if include_questions:
result_sections = c.strip()
m = re.search(
r"Summary\s*:\s*(.*?)\s*(?:VQA\s+Answers\s*:\s*(.*))?$",
result_sections,
flags=re.IGNORECASE | re.DOTALL,
)
if m:
summary_text = (
m.group(1).replace("\n", " ").strip() if m.group(1) else None
)
vqa_text = m.group(2).strip() if m.group(2) else None
if not summary_text or not vqa_text:
raise ValueError(
f"Model output is missing either summary or VQA answers: {c}"
)
bullets_summary.append(f"- [{t:.3f}s] {summary_text}")
bullets_vqa.append(f"- [{t:.3f}s] {vqa_text}")
else:
raise ValueError(
f"Failed to parse summary and VQA answers from model output: {c}"
)
else:
snippet = c.replace("\n", " ").strip()
bullets_summary.append(f"- [{t:.3f}s] {snippet}")
return bullets_summary, bullets_vqa
def _normalize_whitespace(s: str) -> str:
return re.sub(r"\s+", " ", s).strip()
def _strip_prompt_prefix_literal(decoded: str, prompt: str) -> str:
"""
Remove any literal prompt prefix from decoded text using a normalized-substring match.
Best-effort: ensures no prompt text remains at the start of the returned string.
"""
if not decoded:
return ""
if not prompt:
return decoded.strip()
d_norm = _normalize_whitespace(decoded)
p_norm = _normalize_whitespace(prompt)
idx = d_norm.find(p_norm)
if idx != -1:
running = []
for i, ch in enumerate(decoded):
running.append(ch if not ch.isspace() else " ")
cur_norm = _normalize_whitespace("".join(running))
if cur_norm.endswith(p_norm):
return decoded[i + 1 :].lstrip() if i + 1 < len(decoded) else ""
m = re.match(
r"^(?:\s*(system|user|assistant)[:\s-]*\n?)+", decoded, flags=re.IGNORECASE
)
if m:
return decoded[m.end() :].lstrip()
return decoded.lstrip("\n\r ").lstrip(":;- ").strip()
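An illustrative call showing the intended behaviour of the prefix stripping (the strings are made up for the example):

from ammico.utils import _strip_prompt_prefix_literal

decoded = "Describe this image in one concise caption.  A dog running along a beach."
prompt = "Describe this image in one concise caption."
_strip_prompt_prefix_literal(decoded, prompt)
# -> "A dog running along a beach."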
def find_videos(
path: str = None,
pattern=["mp4"], # TODO: test with more video formats
recursive: bool = True,
limit=5,
random_seed: int = None,
) -> dict:
"""Find video files on the file system."""
if path is None:
path = os.environ.get("AMMICO_DATA_HOME", ".")
if isinstance(pattern, str):
pattern = [pattern]
results = []
for p in pattern:
results.extend(_match_pattern(path, p, recursive=recursive))
if len(results) == 0:
raise FileNotFoundError(f"No files found in {path} with pattern '{pattern}'")
if random_seed is not None:
random.seed(random_seed)
random.shuffle(results)
videos = _limit_results(results, limit)
return initialize_dict(videos)
def find_files(
path: str = None,
pattern=["png", "jpg", "jpeg", "gif", "webp", "avif", "tiff"],

514
ammico/video_summary.py normal file

@@ -0,0 +1,514 @@
import re
import math
import torch
import warnings
from PIL import Image
from torchcodec.decoders import VideoDecoder
from ammico.model import MultimodalSummaryModel
from ammico.utils import (
AnalysisMethod,
AnalysisType,
_categorize_outputs,
_strip_prompt_prefix_literal,
)
from typing import List, Dict, Any, Generator, Tuple, Union, Optional
from transformers import GenerationConfig
class VideoSummaryDetector(AnalysisMethod):
MAX_SAMPLES_CAP = 1000 # safety cap for total extracted frames
def __init__(
self,
summary_model: MultimodalSummaryModel,
subdict: Optional[Dict[str, Any]] = None,
) -> None:
"""
Class for analysing videos using the QWEN-2.5-VL model.
It provides methods for generating captions and answering questions about videos.
Args:
summary_model (MultimodalSummaryModel): An instance of MultimodalSummaryModel to be used for analysis.
subdict (dict, optional): Dictionary containing the videos to be analysed. Defaults to {}.
Returns:
None.
"""
if subdict is None:
subdict = {}
super().__init__(subdict)
self.summary_model = summary_model
def _frame_batch_generator(
self,
timestamps: torch.Tensor,
batch_size: int,
video_decoder: VideoDecoder,
) -> Generator[Tuple[torch.Tensor, torch.Tensor], None, None]:
"""
Yield batches of (frames, timestamps) for given frame indices.
- frames are returned as a torch.Tensor with shape (B, C, H, W).
- timestamps is a 1D torch.Tensor with B elements.
"""
total = int(timestamps.numel())
for start in range(0, total, batch_size):
batch_secs = timestamps[start : start + batch_size].tolist()
fb = video_decoder.get_frames_played_at(batch_secs)
frames = fb.data
if not frames.is_contiguous():
frames = frames.contiguous()
pts = fb.pts_seconds
pts_out = (
pts.cpu().to(dtype=torch.float32)
if isinstance(pts, torch.Tensor)
else torch.tensor(pts, dtype=torch.float32)
)
yield frames, pts_out
def _extract_video_frames(
self,
entry: Dict[str, Any],
frame_rate_per_second: float = 2.0,
batch_size: int = 32,
) -> Generator[Tuple[torch.Tensor, torch.Tensor], None, None]:
"""
Extract frames from a video at a specified frame rate and return them as a generator of batches.
Args:
entry (Dict[str, Any]): Dictionary entry containing the key 'filename' with the path to the video file.
frame_rate_per_second (float, optional): Frame extraction rate in frames per second. Default is 2.
batch_size (int, optional): Number of frames to include in each batch. Default is 32.
Returns:
Generator[Tuple[torch.Tensor, torch.Tensor], None, None]: A generator yielding tuples of
(frames, timestamps), where frames is a tensor of shape (B, C, H, W) and timestamps is a 1D tensor of length B.
"""
filename = entry.get("filename")
if not filename:
raise ValueError("entry must contain key 'filename'")
video_decoder = VideoDecoder(filename)
meta = video_decoder.metadata
video_fps = getattr(meta, "average_fps", None)
if video_fps is None or not (
isinstance(video_fps, (int, float)) and video_fps > 0
):
video_fps = 30.0
begin_stream_seconds = getattr(meta, "begin_stream_seconds", None)
end_stream_seconds = getattr(meta, "end_stream_seconds", None)
nframes = len(video_decoder)
if getattr(meta, "duration_seconds", None) is not None:
duration = float(meta.duration_seconds)
elif begin_stream_seconds is not None and end_stream_seconds is not None:
duration = float(end_stream_seconds) - float(begin_stream_seconds)
elif nframes:
duration = float(nframes) / float(video_fps)
else:
duration = 0.0
if frame_rate_per_second <= 0:
raise ValueError("frame_rate_per_second must be > 0")
n_samples = max(1, int(math.floor(duration * frame_rate_per_second)))
n_samples = min(n_samples, self.MAX_SAMPLES_CAP)
if begin_stream_seconds is not None and end_stream_seconds is not None:
sample_times = torch.linspace(
float(begin_stream_seconds), float(end_stream_seconds), steps=n_samples
)
if sample_times.numel() > 1:
sample_times = torch.clamp(
sample_times,
min=float(begin_stream_seconds),
max=float(end_stream_seconds) - 1e-6,
)
else:
sample_times = torch.linspace(0.0, max(0.0, duration), steps=n_samples)
sample_times = sample_times.to(dtype=torch.float32, device="cpu")
generator = self._frame_batch_generator(sample_times, batch_size, video_decoder)
return generator
def _decode_trimmed_outputs(
self,
generated_ids: torch.Tensor,
inputs: Dict[str, torch.Tensor],
tokenizer,
prompt_texts: List[str],
) -> List[str]:
"""
Trim prompt tokens using attention_mask/input_ids when available and decode to strings.
Then remove any literal prompt prefix using prompt_texts (one per batch element).
"""
batch_size = generated_ids.shape[0]
if "input_ids" in inputs:
token_for_padding = (
tokenizer.pad_token_id
if getattr(tokenizer, "pad_token_id", None) is not None
else getattr(tokenizer, "eos_token_id", None)
)
if token_for_padding is None:
lengths = [int(inputs["input_ids"].shape[1])] * batch_size
else:
lengths = inputs["input_ids"].ne(token_for_padding).sum(dim=1).tolist()
else:
lengths = [0] * batch_size
trimmed_ids = []
for i in range(batch_size):
out_ids = generated_ids[i]
in_len = int(lengths[i]) if i < len(lengths) else 0
if out_ids.size(0) > in_len:
t = out_ids[in_len:]
else:
t = out_ids.new_empty((0,), dtype=out_ids.dtype)
t_cpu = t.to("cpu")
trimmed_ids.append(t_cpu.tolist())
decoded = tokenizer.batch_decode(
trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
decoded_results = []
for ptext, raw in zip(prompt_texts, decoded):
cleaned = _strip_prompt_prefix_literal(raw, ptext)
decoded_results.append(cleaned)
return decoded_results
def _generate_from_processor_inputs(
self,
processor_inputs: Dict[str, torch.Tensor],
prompt_texts: List[str],
tokenizer,
):
"""
Run model.generate on already-processed processor_inputs (tensors moved to device),
then decode and trim prompt tokens & remove literal prompt prefixes using prompt_texts.
"""
gen_conf = GenerationConfig(
max_new_tokens=64,
do_sample=False,
num_return_sequences=1,
)
for k, v in list(processor_inputs.items()):
if isinstance(v, torch.Tensor):
processor_inputs[k] = v.to(self.summary_model.device)
with torch.inference_mode():
try:
if self.summary_model.device == "cuda":
with torch.amp.autocast("cuda", enabled=True):
generated_ids = self.summary_model.model.generate(
**processor_inputs, generation_config=gen_conf
)
else:
generated_ids = self.summary_model.model.generate(
**processor_inputs, generation_config=gen_conf
)
except RuntimeError as e:
warnings.warn(
f"Generation failed with error: {e}. Retrying with cuDNN disabled.",
RuntimeWarning,
)
cudnn_was_enabled = (
torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled
)
if cudnn_was_enabled:
torch.backends.cudnn.enabled = False
try:
generated_ids = self.summary_model.model.generate(
**processor_inputs, generation_config=gen_conf
)
except Exception as retry_error:
raise RuntimeError(
f"Failed to generate ids after retry: {retry_error}"
) from retry_error
finally:
if cudnn_was_enabled:
torch.backends.cudnn.enabled = True
decoded = self._decode_trimmed_outputs(
generated_ids, processor_inputs, tokenizer, prompt_texts
)
return decoded
def _tensor_batch_to_pil_list(self, batch: torch.Tensor) -> List[Image.Image]:
"""
Convert a uint8 torch tensor batch (B, C, H, W) on CPU to list of PIL images (RGB).
The conversion is done on CPU and returns PIL.Image objects.
"""
if batch.device.type != "cpu":
batch = batch.to("cpu")
batch = batch.contiguous()
if batch.dtype.is_floating_point:
batch = (batch.clamp(0.0, 1.0) * 255.0).to(torch.uint8)
elif batch.dtype != torch.uint8:
batch = batch.to(torch.uint8)
pil_list: List[Image.Image] = []
for frame in batch:
arr = frame.permute(1, 2, 0).numpy()
pil_list.append(Image.fromarray(arr))
return pil_list
def make_captions_from_extracted_frames(
self,
extracted_video_gen: Generator[Tuple[torch.Tensor, torch.Tensor], None, None],
list_of_questions: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Generate captions for all extracted frames and then produce a concise summary of the video.
Args:
extracted_video_gen (Generator): Generator yielding (frames, timestamps) batches, as produced by _extract_video_frames.
list_of_questions (List[str], optional): Questions to answer for each frame in addition to the caption. Defaults to None.
Returns:
Dict[str, Any]: A dictionary with "summary_bullets" (timestamped captions) and "vqa_bullets" (timestamped answer blocks; empty if no questions were given).
"""
caption_instruction = "Describe this image in one concise caption."
include_questions = bool(list_of_questions)
if include_questions:
q_block = "\n".join(
[f"{i + 1}. {q.strip()}" for i, q in enumerate(list_of_questions)]
)
caption_instruction += (
" In addition to the concise caption, also answer the following questions based ONLY on the image. Answers must be very brief and concise."
" Produce exactly two labeled sections: \n\n"
"Summary: <concise summary>\n\n"
"VQA Answers: \n1. <answer to question 1>\n2. <answer to question 2>\n etc."
"\nReturn only those two sections for each image (do not add extra commentary)."
"\nIf the answer cannot be determined based on the provided answer blocks,"
' reply with the line "The answer cannot be determined based on the information provided."'
f"\n\nQuestions:\n{q_block}"
)
collected: List[Tuple[float, str]] = []
proc = self.summary_model.processor
try:
for batch_frames, batch_times in extracted_video_gen:
pil_list = self._tensor_batch_to_pil_list(batch_frames.cpu())
prompt_texts = []
for p in pil_list:
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": p},
{"type": "text", "text": caption_instruction},
],
}
]
prompt_text = proc.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
prompt_texts.append(prompt_text)
processor_inputs = proc(
text=prompt_texts,
images=pil_list,
return_tensors="pt",
padding=True,
)
captions = self._generate_from_processor_inputs(
processor_inputs,
prompt_texts,
self.summary_model.tokenizer,
)
if isinstance(batch_times, torch.Tensor):
batch_times_list = batch_times.cpu().tolist()
else:
batch_times_list = list(batch_times)
for t, c in zip(batch_times_list, captions):
collected.append((float(t), c))
finally:
try:
extracted_video_gen.close()
except Exception:
warnings.warn("Failed to close video frame generator.", RuntimeWarning)
collected.sort(key=lambda x: x[0])
bullets_summary, bullets_vqa = _categorize_outputs(collected, include_questions)
return {
"summary_bullets": bullets_summary,
"vqa_bullets": bullets_vqa,
}  # TODO: consider removing the timestamps from the returned structure
def final_summary(self, summary_dict: Dict[str, Any]) -> Dict[str, Any]:
"""
Produce a concise summary of the video, based on generated captions for all extracted frames.
Args:
summary_dict (Dict[str, Any]): Dictionary containing captions for the frames.
Returns:
Dict[str, Any]: A dictionary containing the final summary under the key "summary".
"""
summary_instruction = "Analyze the following captions from multiple frames of the same video and summarize the overall content of the video in one concise paragraph (1-3 sentences). Focus on the key themes, actions, or events across the video, not just the individual frames."
proc = self.summary_model.processor
bullets = summary_dict.get("summary_bullets", [])
if not bullets:
raise ValueError("No captions available for summary generation.")
combined_captions_text = "\n".join(bullets)
summary_user_text = summary_instruction + "\n\n" + combined_captions_text + "\n"
messages = [
{
"role": "user",
"content": [{"type": "text", "text": summary_user_text}],
}
]
summary_prompt_text = proc.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
summary_inputs = proc(
text=[summary_prompt_text], return_tensors="pt", padding=True
)
summary_inputs = {
k: v.to(self.summary_model.device) if isinstance(v, torch.Tensor) else v
for k, v in summary_inputs.items()
}
final_summary_list = self._generate_from_processor_inputs(
summary_inputs,
[summary_prompt_text],
self.summary_model.tokenizer,
)
final_summary = final_summary_list[0].strip() if final_summary_list else ""
return {
"summary": final_summary,
}
def final_answers(
self,
answers_dict: Dict[str, Any],
list_of_questions: List[str],
) -> Dict[str, Any]:
"""
Answer the list of questions for the video based on the VQA bullets from the frames.
Args:
answers_dict (Dict[str, Any]): Dictionary containing the VQA bullets.
list_of_questions (List[str]): Questions to answer about the video.
Returns:
Dict[str, Any]: A dictionary containing the list of answers to the questions.
"""
vqa_bullets = answers_dict.get("vqa_bullets", [])
if not vqa_bullets:
raise ValueError(
"No VQA bullets generated for single frames available for answering questions."
)
include_questions = bool(list_of_questions)
if include_questions:
q_block = "\n".join(
[f"{i + 1}. {q.strip()}" for i, q in enumerate(list_of_questions)]
)
prompt = (
"You are provided with a set of short VQA-captions, each of which is a block of short answers"
" extracted from individual frames of the same video.\n\n"
"VQA-captions (use ONLY these to answer):\n"
f"{vqa_bullets}\n\n"
"Answer the following questions briefly, based ONLY on the lists of answers provided above. The VQA-captions above contain answers"
" to the same questions you are about to answer. If the answer cannot be determined based on the provided answer blocks,"
' reply with the line "The answer cannot be determined based on the information provided."'
"Questions:\n"
f"{q_block}\n\n"
"Produce an ordered list with answers in the same order as the questions. You must have this structure of your output: "
"Answers: \n1. <answer to question 1>\n2. <answer to question 2>\n etc."
"Return ONLY the ordered list with answers and NOTHING else — no commentary, no explanation, no surrounding markdown."
)
else:
raise ValueError(
"list_of_questions must be provided for making final answers."
)
proc = self.summary_model.processor
messages = [
{
"role": "user",
"content": [{"type": "text", "text": prompt}],
}
]
final_vqa_prompt_text = proc.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
final_vqa_inputs = proc(
text=[final_vqa_prompt_text], return_tensors="pt", padding=True
)
final_vqa_inputs = {
k: v.to(self.summary_model.device) if isinstance(v, torch.Tensor) else v
for k, v in final_vqa_inputs.items()
}
final_vqa_list = self._generate_from_processor_inputs(
final_vqa_inputs,
[final_vqa_prompt_text],
self.summary_model.tokenizer,
)
final_vqa_output = final_vqa_list[0].strip() if final_vqa_list else ""
vqa_answers = []
answer_matches = re.findall(
r"\d+\.\s*(.*?)(?=\n\d+\.|$)", final_vqa_output, flags=re.DOTALL
)
for answer in answer_matches:
vqa_answers.append(answer.strip())
return {
"vqa_answers": vqa_answers,
}
def analyse_videos_from_dict(
self,
analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY,
frame_rate_per_second: float = 2.0,
list_of_questions: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Analyse all videos in self.subdict using frame extraction and captioning.
Args:
analysis_type (Union[AnalysisType, str], optional): Type of analysis to perform. Defaults to AnalysisType.SUMMARY.
frame_rate_per_second (float): Frame extraction rate in frames per second. Default is 2.0.
list_of_questions (List[str], optional): List of questions to answer about the video. Required if analysis_type includes questions.
Returns:
Dict[str, Any]: A dictionary keyed by video, containing the summary and the answers to the provided questions (if any).
"""
all_answers = {}
analysis_type, is_summary, is_questions = AnalysisType._validate_analysis_type(
analysis_type, list_of_questions
)
for video_key, entry in self.subdict.items():
summary_and_vqa = {}
frames_generator = self._extract_video_frames(
entry, frame_rate_per_second=frame_rate_per_second
)
answers_dict = self.make_captions_from_extracted_frames(
frames_generator, list_of_questions=list_of_questions
)
# TODO: captions will need post-processing once audio analysis is added
# TODO: captions and answers may produce a prompt that exceeds the model's context limit; consider a hierarchical approach.
if is_summary:
answer = self.final_summary(answers_dict)
summary_and_vqa["summary"] = answer["summary"]
if is_questions:
answer = self.final_answers(answers_dict, list_of_questions)
summary_and_vqa["vqa_answers"] = answer["vqa_answers"]
all_answers[video_key] = summary_and_vqa
return all_answers
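And a hedged end-to-end sketch for the video path, combining find_videos with the detector above, as in the demo notebook (paths and questions are placeholders):

import ammico

video_dict = ammico.find_videos(path="/insert/your/path/here/", pattern=["mp4"], limit=-1)
model = ammico.MultimodalSummaryModel()
detector = ammico.VideoSummaryDetector(summary_model=model, subdict=video_dict)

results = detector.analyse_videos_from_dict(
    analysis_type="summary_and_questions",
    frame_rate_per_second=2.0,
    list_of_questions=["Are there any visible signs of violence?"],
)
for key, res in results.items():
    print(key, res["summary"], res["vqa_answers"])

model.close()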

20
environment.yml normal file

@@ -0,0 +1,20 @@
name: ammico-dev
channels:
- pytorch
- nvidia
- rapidsai
- conda-forge
- defaults
dependencies:
- python=3.11
- cudatoolkit=11.8
- pytorch=2.5.1
- pytorch-cuda=11.8
- torchvision=0.20.1
- torchaudio=2.5.1
- faiss-gpu-raft=1.8.0
- ipykernel
- jupyterlab
- jupyterlab_widgets
- ffmpeg<8


@@ -18,16 +18,19 @@ classifiers = [
"Operating System :: OS Independent",
"License :: OSI Approved :: MIT License",
]
dependencies = [
"accelerate>=0.22",
"bitsandbytes",
"colorgram.py",
"colour-science",
"dash",
"dash-bootstrap-components",
"deepface",
"google-cloud-vision",
"googletrans==4.0.0rc1",
"googletrans-py", # instead of googletrans4.0.0rc1, for a temporary solution due to incompatibility with jupyterlab
"grpcio",
"huggingface-hub>=0.34.0",
"importlib_metadata",
"importlib_resources",
"matplotlib",
@@ -36,12 +39,17 @@ dependencies = [
"pandas",
"Pillow",
"pooch",
"qwen-vl-utils",
"retina_face",
"safetensors>=0.6.2",
"setuptools",
"spacy",
"tensorflow<=2.16.0",
"tensorflow<2.15", # instead of <=2.16.0 to make it compatible with CUDA 11.8, may change after updating CUDA version.
"tf-keras",
"torchvision",
"tqdm",
"transformers>=4.54",
"torchcodec<0.2",
"webcolors",
]