From 34afed53755d5e13945e036e5ccfc49d8c042824 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 12 Sep 2025 15:12:07 +0200 Subject: [PATCH 01/31] add new dependencies for upcoming models --- environment.yml | 19 +++++++++++++++++++ pyproject.toml | 13 ++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..2268e08 --- /dev/null +++ b/environment.yml @@ -0,0 +1,19 @@ +name: ammico-dev +channels: + - pytorch + - nvidia + - rapidsai + - conda-forge + - defaults + +dependencies: + - python=3.11 + - cudatoolkit=11.8 + - pytorch=2.3.1 + - pytorch-cuda=11.8 + - torchvision=0.18.1 + - torchaudio=2.3.1 + - faiss-gpu-raft=1.8.0 + - ipykernel + - jupyterlab + - jupyterlab_widgets \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4cad313..1507f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,16 +18,20 @@ classifiers = [ "Operating System :: OS Independent", "License :: OSI Approved :: MIT License", ] - + dependencies = [ + "accelerate>=0.22", + "bitsandbytes", "colorgram.py", "colour-science", "dash", "dash-bootstrap-components", + "decord", "deepface", "google-cloud-vision", - "googletrans==4.0.0rc1", + "googletrans-py", # instead of googletrans4.0.0rc1, for a temporary solution due the incompatibility with jupyterlab "grpcio", + "huggingface-hub>=0.34.0", "importlib_metadata", "importlib_resources", "matplotlib", @@ -36,12 +40,15 @@ dependencies = [ "pandas", "Pillow", "pooch", + "qwen-vl-utils[decord]==0.0.8", "retina_face", + "safetensors>=0.6.2", "setuptools", "spacy", - "tensorflow<=2.16.0", + "tensorflow<2.15", # instead of <=2.16.0 to make it compatible with CUDA 11.8, may change after updating CUDA version. "tf-keras", "tqdm", + "transformers>=4.54", "webcolors", ] From 5583bbed08f48c133b01e4b0dcbdb528dc6f38a3 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 22 Aug 2025 15:43:38 +0200 Subject: [PATCH 02/31] add Model class --- ammico/__init__.py | 2 + ammico/model.py | 111 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 ammico/model.py diff --git a/ammico/__init__.py b/ammico/__init__.py index 67a7065..9a25ade 100644 --- a/ammico/__init__.py +++ b/ammico/__init__.py @@ -1,5 +1,6 @@ from ammico.display import AnalysisExplorer from ammico.faces import EmotionDetector, ethical_disclosure +from ammico.model import MultimodalSummaryModel from ammico.text import TextDetector, TextAnalyzer, privacy_disclosure from ammico.utils import find_files, get_dataframe @@ -14,6 +15,7 @@ except ImportError: __all__ = [ "AnalysisExplorer", "EmotionDetector", + "MultimodalSummaryModel", "TextDetector", "TextAnalyzer", "find_files", diff --git a/ammico/model.py b/ammico/model.py new file mode 100644 index 0000000..80cc31f --- /dev/null +++ b/ammico/model.py @@ -0,0 +1,111 @@ +import torch +import warnings +from transformers import ( + Qwen2_5_VLForConditionalGeneration, + AutoProcessor, + BitsAndBytesConfig, + AutoTokenizer, +) +from typing import Optional + + +class MultimodalSummaryModel: + DEFAULT_CUDA_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" + DEFAULT_CPU_MODEL = "Qwen/Qwen2.5-VL-3B-Instruct" + + def __init__( + self, + model_id: Optional[str] = None, + device: Optional[str] = None, + cache_dir: Optional[str] = None, + ) -> None: + """ + Class for QWEN-2.5-VL model loading and inference. 
+ Args: + model_id: Type of model to load, defaults to a smaller version for CPU if device is "cpu". + device: "cuda" or "cpu" (auto-detected when None). + cache_dir: huggingface cache dir (optional). + """ + self.device = self._resolve_device(device) + self.model_id = model_id or ( + self.DEFAULT_CUDA_MODEL if self.device == "cuda" else self.DEFAULT_CPU_MODEL + ) + + self.cache_dir = cache_dir + self._trust_remote_code = True + self._quantize = True + + self.model = None + self.processor = None + self.tokenizer = None + + self._load_model_and_processor() + + @staticmethod + def _resolve_device(device: Optional[str]) -> str: + if device is None: + return "cuda" if torch.cuda.is_available() else "cpu" + if device.lower() not in ("cuda", "cpu"): + raise ValueError("device must be 'cuda' or 'cpu'") + if device.lower() == "cuda" and not torch.cuda.is_available(): + warnings.warn( + "Although 'cuda' was requested, no CUDA device is available. Using CPU instead.", + RuntimeWarning, + stacklevel=2, + ) + return "cpu" + return device.lower() + + def _load_model_and_processor(self): + load_kwargs = {"trust_remote_code": self._trust_remote_code, "use_cache": True} + if self.cache_dir: + load_kwargs["cache_dir"] = self.cache_dir + + self.processor = AutoProcessor.from_pretrained( + self.model_id, padding_side="left", **load_kwargs + ) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, **load_kwargs) + + if self.device == "cuda": + compute_dtype = ( + torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + ) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=compute_dtype, + ) + load_kwargs["quantization_config"] = bnb_config + load_kwargs["device_map"] = "auto" + + else: + load_kwargs.pop("quantization_config", None) + load_kwargs.pop("device_map", None) + + self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + self.model_id, **load_kwargs + ) + self.model.eval() + + def _close(self) -> None: + """Free model resources (helpful in long-running processes).""" + try: + if self.model is not None: + del self.model + self.model = None + finally: + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception as e: + warnings.warn( + "Failed to empty CUDA cache. 
This is not critical, but may lead to memory lingering: " + f"{e!r}", + RuntimeWarning, + stacklevel=2, + ) + + def close(self) -> None: + """Free model resources (helpful in long-running processes).""" + self._close() From bd63be469392a593342cf518531b00e283c8dbbd Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 22 Aug 2025 15:43:38 +0200 Subject: [PATCH 03/31] add Model class --- ammico/__init__.py | 2 + ammico/model.py | 111 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 ammico/model.py diff --git a/ammico/__init__.py b/ammico/__init__.py index 67a7065..9a25ade 100644 --- a/ammico/__init__.py +++ b/ammico/__init__.py @@ -1,5 +1,6 @@ from ammico.display import AnalysisExplorer from ammico.faces import EmotionDetector, ethical_disclosure +from ammico.model import MultimodalSummaryModel from ammico.text import TextDetector, TextAnalyzer, privacy_disclosure from ammico.utils import find_files, get_dataframe @@ -14,6 +15,7 @@ except ImportError: __all__ = [ "AnalysisExplorer", "EmotionDetector", + "MultimodalSummaryModel", "TextDetector", "TextAnalyzer", "find_files", diff --git a/ammico/model.py b/ammico/model.py new file mode 100644 index 0000000..80cc31f --- /dev/null +++ b/ammico/model.py @@ -0,0 +1,111 @@ +import torch +import warnings +from transformers import ( + Qwen2_5_VLForConditionalGeneration, + AutoProcessor, + BitsAndBytesConfig, + AutoTokenizer, +) +from typing import Optional + + +class MultimodalSummaryModel: + DEFAULT_CUDA_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" + DEFAULT_CPU_MODEL = "Qwen/Qwen2.5-VL-3B-Instruct" + + def __init__( + self, + model_id: Optional[str] = None, + device: Optional[str] = None, + cache_dir: Optional[str] = None, + ) -> None: + """ + Class for QWEN-2.5-VL model loading and inference. + Args: + model_id: Type of model to load, defaults to a smaller version for CPU if device is "cpu". + device: "cuda" or "cpu" (auto-detected when None). + cache_dir: huggingface cache dir (optional). + """ + self.device = self._resolve_device(device) + self.model_id = model_id or ( + self.DEFAULT_CUDA_MODEL if self.device == "cuda" else self.DEFAULT_CPU_MODEL + ) + + self.cache_dir = cache_dir + self._trust_remote_code = True + self._quantize = True + + self.model = None + self.processor = None + self.tokenizer = None + + self._load_model_and_processor() + + @staticmethod + def _resolve_device(device: Optional[str]) -> str: + if device is None: + return "cuda" if torch.cuda.is_available() else "cpu" + if device.lower() not in ("cuda", "cpu"): + raise ValueError("device must be 'cuda' or 'cpu'") + if device.lower() == "cuda" and not torch.cuda.is_available(): + warnings.warn( + "Although 'cuda' was requested, no CUDA device is available. 
Using CPU instead.", + RuntimeWarning, + stacklevel=2, + ) + return "cpu" + return device.lower() + + def _load_model_and_processor(self): + load_kwargs = {"trust_remote_code": self._trust_remote_code, "use_cache": True} + if self.cache_dir: + load_kwargs["cache_dir"] = self.cache_dir + + self.processor = AutoProcessor.from_pretrained( + self.model_id, padding_side="left", **load_kwargs + ) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, **load_kwargs) + + if self.device == "cuda": + compute_dtype = ( + torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + ) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=compute_dtype, + ) + load_kwargs["quantization_config"] = bnb_config + load_kwargs["device_map"] = "auto" + + else: + load_kwargs.pop("quantization_config", None) + load_kwargs.pop("device_map", None) + + self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + self.model_id, **load_kwargs + ) + self.model.eval() + + def _close(self) -> None: + """Free model resources (helpful in long-running processes).""" + try: + if self.model is not None: + del self.model + self.model = None + finally: + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception as e: + warnings.warn( + "Failed to empty CUDA cache. This is not critical, but may lead to memory lingering: " + f"{e!r}", + RuntimeWarning, + stacklevel=2, + ) + + def close(self) -> None: + """Free model resources (helpful in long-running processes).""" + self._close() From d20c4d68e4d1b2eacf59bb6ad8c1f03623731f80 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 12 Sep 2025 17:48:57 +0200 Subject: [PATCH 04/31] vqa --- ammico/image_summary.py | 343 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 ammico/image_summary.py diff --git a/ammico/image_summary.py b/ammico/image_summary.py new file mode 100644 index 0000000..c4b2444 --- /dev/null +++ b/ammico/image_summary.py @@ -0,0 +1,343 @@ +from ammico.utils import AnalysisMethod, AnalysisType +from ammico.model import MultimodalSummaryModel + +import os +import torch +from PIL import Image +import warnings + +from typing import List, Optional, Union, Dict, Any +from collections.abc import Sequence as _Sequence +from transformers import GenerationConfig +import re +from qwen_vl_utils import process_vision_info + + +class ImageSummaryDetector(AnalysisMethod): + def __init__( + self, + summary_model: MultimodalSummaryModel, + subdict: dict = {}, + ) -> None: + """ + Class for analysing images using QWEN-2.5-VL model. + It provides methods for generating captions and answering questions about images. + + Args: + summary_model ([type], optional): An instance of MultimodalSummaryModel to be used for analysis. + subdict (dict, optional): Dictionary containing the image to be analysed. Defaults to {}. + + Returns: + None. 
+ """ + + super().__init__(subdict) + self.summary_model = summary_model + + def _load_pil_if_needed( + self, filename: Union[str, os.PathLike, Image.Image] + ) -> Image.Image: + if isinstance(filename, (str, os.PathLike)): + return Image.open(filename).convert("RGB") + elif isinstance(filename, Image.Image): + return filename.convert("RGB") + else: + raise ValueError("filename must be a path or PIL.Image") + + @staticmethod + def _is_sequence_but_not_str(obj: Any) -> bool: + """True for sequence-like but not a string/bytes/PIL.Image.""" + return isinstance(obj, _Sequence) and not isinstance( + obj, (str, bytes, Image.Image) + ) + + def _prepare_inputs( + self, list_of_questions: list[str], entry: Optional[Dict[str, Any]] = None + ) -> Dict[str, torch.Tensor]: + filename = entry.get("filename") + if filename is None: + raise ValueError("entry must contain key 'filename'") + + if isinstance(filename, (str, os.PathLike, Image.Image)): + images_context = self._load_pil_if_needed(filename) + elif self._is_sequence_but_not_str(filename): + images_context = [self._load_pil_if_needed(i) for i in filename] + else: + raise ValueError( + "Unsupported 'filename' entry: expected path, PIL.Image, or sequence." + ) + + images_only_messages = [ + { + "role": "user", + "content": [ + *( + [{"type": "image", "image": img} for img in images_context] + if isinstance(images_context, list) + else [{"type": "image", "image": images_context}] + ) + ], + } + ] + + try: + image_inputs, _ = process_vision_info(images_only_messages) + except Exception as e: + raise RuntimeError(f"Image processing failed: {e}") + + texts: List[str] = [] + for q in list_of_questions: + messages = [ + { + "role": "user", + "content": [ + *( + [ + {"type": "image", "image": image} + for image in images_context + ] + if isinstance(images_context, list) + else [{"type": "image", "image": images_context}] + ), + {"type": "text", "text": q}, + ], + } + ] + text = self.summary_model.processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + texts.append(text) + + images_batch = [image_inputs] * len(texts) + inputs = self.summary_model.processor( + text=texts, + images=images_batch, + padding=True, + return_tensors="pt", + ) + inputs = {k: v.to(self.summary_model.device) for k, v in inputs.items()} + + return inputs + + def analyse_images( + self, + analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, + list_of_questions: Optional[List[str]] = None, + max_questions_per_image: int = 32, + keys_batch_size: int = 16, + is_concise_summary: bool = True, + is_concise_answer: bool = True, + ) -> Dict[str, dict]: + """ + Analyse image with model. + + Args: + analysis_type (str): type of the analysis. + list_of_questions (list[str]): list of questions. + max_questions_per_image (int): maximum number of questions per image. We recommend to keep it low to avoid long processing times and high memory usage. + keys_batch_size (int): number of images to process in a batch. + is_concise_summary (bool): whether to generate concise summary. + is_concise_answer (bool): whether to generate concise answers. + Returns: + self.subdict (dict): dictionary with analysis results. + """ + # TODO: add option to ask multiple questions per image as one batch. 
+ if isinstance(analysis_type, AnalysisType): + analysis_type = analysis_type.value + + allowed = {"summary", "questions", "summary_and_questions"} + if analysis_type not in allowed: + raise ValueError(f"analysis_type must be one of {allowed}") + + if list_of_questions is None: + list_of_questions = [ + "Are there people in the image?", + "What is this picture about?", + ] + + keys = list(self.subdict.keys()) + for batch_start in range(0, len(keys), keys_batch_size): + batch_keys = keys[batch_start : batch_start + keys_batch_size] + for key in batch_keys: + entry = self.subdict[key] + if analysis_type in ("summary", "summary_and_questions"): + try: + caps = self.generate_caption( + entry, + num_return_sequences=1, + is_concise_summary=is_concise_summary, + ) + entry["caption"] = caps[0] if caps else "" + except Exception as e: + warnings.warn( + "Caption generation failed for key %s: %s", key, e + ) + + if analysis_type in ("questions", "summary_and_questions"): + if len(list_of_questions) > max_questions_per_image: + raise ValueError( + f"Number of questions per image ({len(list_of_questions)}) exceeds safety cap ({max_questions_per_image})." + " Reduce questions or increase max_questions_per_image." + ) + try: + vqa_map = self.answer_questions( + list_of_questions, entry, is_concise_answer + ) + entry["vqa"] = vqa_map + except Exception as e: + warnings.warn("VQA failed for key %s: %s", key, e) + + self.subdict[key] = entry + return self.subdict + + def generate_caption( + self, + entry: Optional[Dict[str, Any]] = None, + num_return_sequences: int = 1, + is_concise_summary: bool = True, + ) -> List[str]: + """ + Create caption for image. Depending on is_concise_summary it will be either concise or detailed. + + Args: + entry (dict): dictionary containing the image to be captioned. + num_return_sequences (int): number of captions to generate. + is_concise_summary (bool): whether to generate concise summary. + + Returns: + results (list[str]): list of generated captions. + """ + if is_concise_summary: + prompt = ["Describe this image in one concise caption."] + max_new_tokens = 64 + else: + prompt = ["Describe this image."] + max_new_tokens = 256 + inputs = self._prepare_inputs(prompt, entry) + + gen_conf = GenerationConfig( + max_new_tokens=max_new_tokens, + do_sample=False, + num_return_sequences=num_return_sequences, + ) + + with torch.inference_mode(): + try: + if self.summary_model.device == "cuda": + with torch.cuda.amp.autocast(enabled=True): + generated_ids = self.summary_model.model.generate( + **inputs, generation_config=gen_conf + ) + else: + generated_ids = self.summary_model.model.generate( + **inputs, generation_config=gen_conf + ) + except RuntimeError as e: + warnings.warn( + "Retry without autocast failed: %s. 
Attempting cudnn-disabled retry.", + e, + ) + cudnn_was_enabled = ( + torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled + ) + if cudnn_was_enabled: + torch.backends.cudnn.enabled = False + try: + generated_ids = self.summary_model.model.generate( + **inputs, generation_config=gen_conf + ) + except Exception as retry_error: + raise RuntimeError( + f"Failed to generate ids after retry: {retry_error}" + ) from retry_error + finally: + if cudnn_was_enabled: + torch.backends.cudnn.enabled = True + + decoded = None + if "input_ids" in inputs: + in_ids = inputs["input_ids"] + trimmed = [ + out_ids[len(inp_ids) :] + for inp_ids, out_ids in zip(in_ids, generated_ids) + ] + decoded = self.summary_model.tokenizer.batch_decode( + trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + else: + decoded = self.summary_model.tokenizer.batch_decode( + generated_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + + results = [d.strip() for d in decoded] + return results + + def answer_questions( + self, + list_of_questions: list[str], + entry: Optional[Dict[str, Any]] = None, + is_concise_answer: bool = True, + ) -> List[str]: + """ + Create answers for list of questions about image. + Args: + list_of_questions (list[str]): list of questions. + entry (dict): dictionary containing the image to be captioned. + is_concise_answer (bool): whether to generate concise answers. + Returns: + answers (list[str]): list of answers. + """ + if is_concise_answer: + gen_conf = GenerationConfig(max_new_tokens=64, do_sample=False) + for i in range(len(list_of_questions)): + if not list_of_questions[i].strip().endswith("?"): + list_of_questions[i] = list_of_questions[i].strip() + "?" + if not list_of_questions[i].lower().startswith("answer concisely"): + list_of_questions[i] = "Answer concisely: " + list_of_questions[i] + else: + gen_conf = GenerationConfig(max_new_tokens=128, do_sample=False) + + question_chunk_size = 8 + answers: List[str] = [] + n = len(list_of_questions) + for i in range(0, n, question_chunk_size): + chunk = list_of_questions[i : i + question_chunk_size] + inputs = self._prepare_inputs(chunk, entry) + with torch.inference_mode(): + if self.summary_model.device == "cuda": + with torch.cuda.amp.autocast(enabled=True): + out_ids = self.summary_model.model.generate( + **inputs, generation_config=gen_conf + ) + else: + out_ids = self.summary_model.model.generate( + **inputs, generation_config=gen_conf + ) + + if "input_ids" in inputs: + in_ids = inputs["input_ids"] + trimmed_batch = [ + out_row[len(inp_row) :] for inp_row, out_row in zip(in_ids, out_ids) + ] + decoded = self.summary_model.tokenizer.batch_decode( + trimmed_batch, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + else: + decoded = self.summary_model.tokenizer.batch_decode( + out_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + + answers.extend([d.strip() for d in decoded]) + + if len(answers) != len(list_of_questions): + raise ValueError( + f"Expected {len(list_of_questions)} answers, but got {len(answers)}, try vary amount of questions" + ) + + return answers From 2326aef4b561e360918c5fa78a5cf6c322517df5 Mon Sep 17 00:00:00 2001 From: Dmitrii Kapitan Date: Mon, 22 Sep 2025 16:40:02 +0200 Subject: [PATCH 05/31] Add example notebook and small fixes --- ammico/__init__.py | 2 + ammico/image_summary.py | 1 - ammico/notebooks/DemoImageSummaryVQA.ipynb | 190 +++++++++++++++++++++ ammico/utils.py | 9 + 4 files changed, 201 insertions(+), 
1 deletion(-) create mode 100644 ammico/notebooks/DemoImageSummaryVQA.ipynb diff --git a/ammico/__init__.py b/ammico/__init__.py index 9a25ade..1bf343d 100644 --- a/ammico/__init__.py +++ b/ammico/__init__.py @@ -2,6 +2,7 @@ from ammico.display import AnalysisExplorer from ammico.faces import EmotionDetector, ethical_disclosure from ammico.model import MultimodalSummaryModel from ammico.text import TextDetector, TextAnalyzer, privacy_disclosure +from ammico.image_summary import ImageSummaryDetector from ammico.utils import find_files, get_dataframe # Export the version defined in project metadata @@ -18,6 +19,7 @@ __all__ = [ "MultimodalSummaryModel", "TextDetector", "TextAnalyzer", + "ImageSummaryDetector", "find_files", "get_dataframe", "ethical_disclosure", diff --git a/ammico/image_summary.py b/ammico/image_summary.py index c4b2444..0cdaebe 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -9,7 +9,6 @@ import warnings from typing import List, Optional, Union, Dict, Any from collections.abc import Sequence as _Sequence from transformers import GenerationConfig -import re from qwen_vl_utils import process_vision_info diff --git a/ammico/notebooks/DemoImageSummaryVQA.ipynb b/ammico/notebooks/DemoImageSummaryVQA.ipynb new file mode 100644 index 0000000..b067e1f --- /dev/null +++ b/ammico/notebooks/DemoImageSummaryVQA.ipynb @@ -0,0 +1,190 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# Image summary and visual question answering" + ] + }, + { + "cell_type": "markdown", + "id": "1", + "metadata": {}, + "source": [ + "This notebook shows how to generate image captions and use the visual question answering with AMMICO. \n", + "\n", + "The first cell imports `ammico`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": {}, + "outputs": [], + "source": [ + "import ammico" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "The cell below loads the model for VQA tasks. By default, it loads a large model on the GPU (if your device supports CUDA), otherwise it loads a relatively smaller model on the CPU. But you can specify other settings (e.g., a small model on the GPU) if you want." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "model = ammico.MultimodalSummaryModel()" + ] + }, + { + "cell_type": "markdown", + "id": "5", + "metadata": {}, + "source": [ + "Here you need to provide the path to your google drive folder or local folder containing the images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "image_dict = ammico.find_files(\n", + " path=str(\"/insert/your/path/here/\"),\n", + " limit=-1, # -1 means no limit on the number of files, by default it is set to 20\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "The cell below creates an object that analyzes images and generates a summary using a specific model and image data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "img = ammico.ImageSummaryDetector(summary_model=model, subdict=image_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "9", + "metadata": {}, + "source": [ + "## Image summary " + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "To start your work with images, you should call the `analyse_images` method.\n", + "\n", + "You can specify what kind of analysis you want to perform with `analysis_type`. `\"summary\"` will generate a summary for all pictures in your dictionary, `\"questions\"` will prepare answers to your questions for all pictures, and `\"summary_and_questions\"` will do both.\n", + "\n", + "Parameter `\"is_concise_summary\"` regulates the length of an answer.\n", + "\n", + "Here we want to get a long summary on each object in our image dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "summaries = img.analyse_images(analysis_type=\"summary\", is_concise_summary=False)" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": {}, + "source": [ + "## VQA" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": {}, + "source": [ + "In addition to analyzing images in `ammico`, the same model can be used in VQA mode. To do this, you need to define the questions that will be applied to all images from your dict." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": {}, + "outputs": [], + "source": [ + "questions = [\"Are there any visible signs of violence?\", \"Is it safe to be there?\"]" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "Here is an example of VQA mode usage. You can specify whether you want to receive short answers (recommended option) or not." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "vqa_results = img.analyse_images(\n", + " analysis_type=\"questions\",\n", + " list_of_questions=questions,\n", + " is_concise_answer=True,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ammico-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ammico/utils.py b/ammico/utils.py index 39a0ecb..38f4144 100644 --- a/ammico/utils.py +++ b/ammico/utils.py @@ -7,6 +7,9 @@ import collections import random +from enum import Enum + + pkg = importlib_resources.files("ammico") @@ -40,6 +43,12 @@ def ammico_prefetch_models(): res.get() +class AnalysisType(str, Enum): + SUMMARY = "summary" + QUESTIONS = "questions" + SUMMARY_AND_QUESTIONS = "summary_and_questions" + + class AnalysisMethod: """Base class to be inherited by all analysis methods.""" From 0f6f9026cd0dc74e899e4e077e6cafb2606e2a77 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Thu, 25 Sep 2025 12:45:14 +0200 Subject: [PATCH 06/31] fix: missing dependency, obsolete keyword, dash maintenance, demo notebook for new summary --- ammico/display.py | 25 ++++-------------- ammico/notebooks/DemoNotebook_ammico.ipynb | 30 +++++++++------------- ammico/text.py | 11 ++------ pyproject.toml | 1 + 4 files changed, 20 insertions(+), 47 deletions(-) diff --git a/ammico/display.py b/ammico/display.py index c80d0db..2e7c9e4 100644 --- a/ammico/display.py +++ b/ammico/display.py @@ -94,7 +94,6 @@ class AnalysisExplorer: State("left_select_id", "options"), State("left_select_id", "value"), State("Dropdown_select_Detector", "value"), - State("setting_Text_analyse_text", "value"), State("setting_privacy_env_var", "value"), State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_race_threshold", "value"), @@ -157,14 +156,6 @@ class AnalysisExplorer: id="settings_TextDetector", style={"display": "none"}, children=[ - dbc.Row( - dcc.Checklist( - ["Analyse text"], - ["Analyse text"], - id="setting_Text_analyse_text", - style={"margin-bottom": "10px"}, - ), - ), # row 1 dbc.Row( dbc.Col( @@ -344,7 +335,7 @@ class AnalysisExplorer: port (int, optional): The port number to run the server on (default: 8050). 
""" - self.app.run_server(debug=True, port=port) + self.app.run(debug=True, port=port) # Dash callbacks def update_picture(self, img_path: str): @@ -375,16 +366,15 @@ class AnalysisExplorer: } if setting_input == "TextDetector": - return display_flex, display_none, display_none, display_none + return display_flex, display_none, display_none if setting_input == "EmotionDetector": - return display_none, display_flex, display_none, display_none - + return display_none, display_flex, display_none if setting_input == "ColorDetector": - return display_none, display_none, display_flex, display_none + return display_none, display_none, display_flex else: - return display_none, display_none, display_none, display_none + return display_none, display_none, display_none def _right_output_analysis( self, @@ -392,7 +382,6 @@ class AnalysisExplorer: all_img_options: dict, current_img_value: str, detector_value: str, - settings_text_analyse_text: list, setting_privacy_env_var: str, setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, @@ -426,12 +415,8 @@ class AnalysisExplorer: identify_function = identify_dict[detector_value] if detector_value == "TextDetector": - analyse_text = ( - True if settings_text_analyse_text == ["Analyse text"] else False - ) detector_class = identify_function( image_copy, - analyse_text=analyse_text, accept_privacy=( setting_privacy_env_var if setting_privacy_env_var diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index fc8fe22..e17860c 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -104,7 +104,8 @@ "import ammico\n", "\n", "# for displaying a progress bar\n", - "from tqdm import tqdm" + "from tqdm import tqdm\n", + "import os" ] }, { @@ -140,7 +141,9 @@ "metadata": {}, "outputs": [], "source": [ - "# os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/content/drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\"" + "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = (\n", + " \"/home/inga/projects/misinformation-project/misinformation-notes/misinformation-campaign-981aa55a3b13.json\"\n", + ")" ] }, { @@ -171,6 +174,7 @@ "metadata": {}, "outputs": [], "source": [ + "data_path = \"./data-test\"\n", "image_dict = ammico.find_files(\n", " # path = \"/content/drive/MyDrive/misinformation-data/\",\n", " path=str(data_path),\n", @@ -337,7 +341,7 @@ " enumerate(image_dict.keys()), total=len(image_dict)\n", "): # loop through all images\n", " image_dict[key] = ammico.TextDetector(\n", - " image_dict[key], analyse_text=True\n", + " image_dict[key]\n", " ).analyse_image() # analyse image with EmotionDetector and update dict\n", "\n", " if (\n", @@ -361,23 +365,12 @@ "outputs": [], "source": [ "# initialize the models\n", - "image_summary_detector = ammico.SummaryDetector(\n", - " subdict=image_dict, analysis_type=\"summary\", model_type=\"base\"\n", + "model = ammico.MultimodalSummaryModel()\n", + "image_summary_detector = ammico.ImageSummaryDetector(\n", + " subdict=image_dict, summary_model=model\n", ")\n", "\n", - "# run the analysis without having to re-iniatialize the model\n", - "for num, key in tqdm(\n", - " enumerate(image_dict.keys()), total=len(image_dict)\n", - "): # loop through all images\n", - " image_dict[key] = image_summary_detector.analyse_image(\n", - " subdict=image_dict[key], analysis_type=\"summary\"\n", - " ) # analyse image with SummaryDetector and update dict\n", - "\n", - " if (\n", - " num % dump_every 
== 0 | num == len(image_dict) - 1\n", - " ): # save results every dump_every to dump_file\n", - " image_df = ammico.get_dataframe(image_dict)\n", - " image_df.to_csv(dump_file)" + "image_summary_detector.analyse_images(analysis_type=\"summary\")" ] }, { @@ -394,6 +387,7 @@ "outputs": [], "source": [ "# initialize the models\n", + "# currently this does not work because of the way the summary detector is implemented\n", "image_summary_detector = ammico.SummaryDetector(\n", " subdict=image_dict, analysis_type=\"summary\", model_type=\"base\"\n", ")\n", diff --git a/ammico/text.py b/ammico/text.py index bf39cc6..4bec28c 100644 --- a/ammico/text.py +++ b/ammico/text.py @@ -67,7 +67,6 @@ class TextDetector(AnalysisMethod): def __init__( self, subdict: dict, - analyse_text: bool = False, skip_extraction: bool = False, accept_privacy: str = "PRIVACY_AMMICO", ) -> None: @@ -76,8 +75,6 @@ class TextDetector(AnalysisMethod): Args: subdict (dict): Dictionary containing file name/path, and possibly previous analysis results from other modules. - analyse_text (bool, optional): Decide if extracted text will be further subject - to analysis. Defaults to False. skip_extraction (bool, optional): Decide if text will be extracted from images or is already provided via a csv. Defaults to False. accept_privacy (str, optional): Environment variable to accept the privacy @@ -96,17 +93,13 @@ class TextDetector(AnalysisMethod): "Privacy disclosure not accepted - skipping text detection." ) self.translator = Translator(raise_exception=True) - if not isinstance(analyse_text, bool): - raise ValueError("analyse_text needs to be set to true or false") - self.analyse_text = analyse_text self.skip_extraction = skip_extraction if not isinstance(skip_extraction, bool): raise ValueError("skip_extraction needs to be set to true or false") if self.skip_extraction: print("Skipping text extraction from image.") print("Reading text directly from provided dictionary.") - if self.analyse_text: - self._initialize_spacy() + self._initialize_spacy() def set_keys(self) -> dict: """Set the default keys for text analysis. @@ -183,7 +176,7 @@ class TextDetector(AnalysisMethod): self._truncate_text() self.translate_text() self.remove_linebreaks() - if self.analyse_text and self.subdict["text_english"]: + if self.subdict["text_english"]: self._run_spacy() return self.subdict diff --git a/pyproject.toml b/pyproject.toml index 1507f8b..c0f5440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ dependencies = [ "spacy", "tensorflow<2.15", # instead of <=2.16.0 to make it compatible with CUDA 11.8, may change after updating CUDA version. 
"tf-keras", + "torchvision", "tqdm", "transformers>=4.54", "webcolors", From d1a4954669b4917fa78cb40f23ab9f3fe21437ed Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 26 Sep 2025 08:51:09 +0200 Subject: [PATCH 07/31] tests: remove analyse_text keyword for text detector --- ammico/test/test_display.py | 1 - ammico/test/test_text.py | 20 ++++---------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/ammico/test/test_display.py b/ammico/test/test_display.py index 83d53dc..d7241a2 100644 --- a/ammico/test/test_display.py +++ b/ammico/test/test_display.py @@ -50,7 +50,6 @@ def test_right_output_analysis_emotions(get_AE, get_options, monkeypatch): get_options[3], get_options[0], "EmotionDetector", - True, "SOME_VAR", 50, 50, diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py index cffb321..cd9f863 100644 --- a/ammico/test/test_text.py +++ b/ammico/test/test_text.py @@ -52,24 +52,16 @@ def test_privacy_statement(monkeypatch): def test_TextDetector(set_testdict, accepted): for item in set_testdict: test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted) - assert not test_obj.analyse_text assert not test_obj.skip_extraction assert test_obj.subdict["filename"] == set_testdict[item]["filename"] - test_obj = tt.TextDetector( - {}, analyse_text=True, skip_extraction=True, accept_privacy=accepted - ) - assert test_obj.analyse_text + test_obj = tt.TextDetector({}, skip_extraction=True, accept_privacy=accepted) assert test_obj.skip_extraction - with pytest.raises(ValueError): - tt.TextDetector({}, analyse_text=1.0, accept_privacy=accepted) with pytest.raises(ValueError): tt.TextDetector({}, skip_extraction=1.0, accept_privacy=accepted) def test_run_spacy(set_testdict, get_path, accepted): - test_obj = tt.TextDetector( - set_testdict["IMG_3755"], analyse_text=True, accept_privacy=accepted - ) + test_obj = tt.TextDetector(set_testdict["IMG_3755"], accept_privacy=accepted) ref_file = get_path + "text_IMG_3755.txt" with open(ref_file, "r") as file: reference_text = file.read() @@ -108,15 +100,11 @@ def test_analyse_image(set_testdict, set_environ, accepted): for item in set_testdict: test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted) test_obj.analyse_image() - test_obj = tt.TextDetector( - set_testdict[item], analyse_text=True, accept_privacy=accepted - ) + test_obj = tt.TextDetector(set_testdict[item], accept_privacy=accepted) test_obj.analyse_image() testdict = {} testdict["text"] = 20000 * "m" - test_obj = tt.TextDetector( - testdict, skip_extraction=True, analyse_text=True, accept_privacy=accepted - ) + test_obj = tt.TextDetector(testdict, skip_extraction=True, accept_privacy=accepted) test_obj.analyse_image() assert test_obj.subdict["text_truncated"] == 5000 * "m" assert test_obj.subdict["text"] == 20000 * "m" From ece132fe14a6139d88d2d7e7559ab6ed10980b21 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 26 Sep 2025 17:29:46 +0200 Subject: [PATCH 08/31] optimize validation of analysis type --- ammico/image_summary.py | 58 +++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/ammico/image_summary.py b/ammico/image_summary.py index 0cdaebe..3ccc3f4 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -6,7 +6,7 @@ import torch from PIL import Image import warnings -from typing import List, Optional, Union, Dict, Any +from typing import List, Optional, Union, Dict, Any, Tuple from collections.abc import Sequence as _Sequence from transformers import 
GenerationConfig from qwen_vl_utils import process_vision_info @@ -118,6 +118,36 @@ class ImageSummaryDetector(AnalysisMethod): return inputs + def _validate_analysis_type( + self, + analysis_type: Union["AnalysisType", str], + list_of_questions: Optional[List[str]], + max_questions_per_image: int, + ) -> Tuple[str, List[str], bool, bool]: + if isinstance(analysis_type, AnalysisType): + analysis_type = analysis_type.value + + allowed = {"summary", "questions", "summary_and_questions"} + if analysis_type not in allowed: + raise ValueError(f"analysis_type must be one of {allowed}") + + if list_of_questions is None: + list_of_questions = [ + "Are there people in the image?", + "What is this picture about?", + ] + + if analysis_type in ("questions", "summary_and_questions"): + if len(list_of_questions) > max_questions_per_image: + raise ValueError( + f"Number of questions per image ({len(list_of_questions)}) exceeds safety cap ({max_questions_per_image}). Reduce questions or increase max_questions_per_image." + ) + + is_summary = analysis_type in ("summary", "summary_and_questions") + is_questions = analysis_type in ("questions", "summary_and_questions") + + return analysis_type, list_of_questions, is_summary, is_questions + def analyse_images( self, analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, @@ -141,25 +171,18 @@ class ImageSummaryDetector(AnalysisMethod): self.subdict (dict): dictionary with analysis results. """ # TODO: add option to ask multiple questions per image as one batch. - if isinstance(analysis_type, AnalysisType): - analysis_type = analysis_type.value - - allowed = {"summary", "questions", "summary_and_questions"} - if analysis_type not in allowed: - raise ValueError(f"analysis_type must be one of {allowed}") - - if list_of_questions is None: - list_of_questions = [ - "Are there people in the image?", - "What is this picture about?", - ] + analysis_type, list_of_questions, is_summary, is_questions = ( + self._validate_analysis_type( + analysis_type, list_of_questions, max_questions_per_image + ) + ) keys = list(self.subdict.keys()) for batch_start in range(0, len(keys), keys_batch_size): batch_keys = keys[batch_start : batch_start + keys_batch_size] for key in batch_keys: entry = self.subdict[key] - if analysis_type in ("summary", "summary_and_questions"): + if is_summary: try: caps = self.generate_caption( entry, @@ -172,12 +195,7 @@ class ImageSummaryDetector(AnalysisMethod): "Caption generation failed for key %s: %s", key, e ) - if analysis_type in ("questions", "summary_and_questions"): - if len(list_of_questions) > max_questions_per_image: - raise ValueError( - f"Number of questions per image ({len(list_of_questions)}) exceeds safety cap ({max_questions_per_image})." - " Reduce questions or increase max_questions_per_image." 
- ) + if is_questions: try: vqa_map = self.answer_questions( list_of_questions, entry, is_concise_answer From 5c7e2c3f640241fb00b4a51d82e9d200675e4a1e Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 26 Sep 2025 18:23:58 +0200 Subject: [PATCH 09/31] 1st try --- ammico/display.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/ammico/display.py b/ammico/display.py index 2e7c9e4..ff86a3e 100644 --- a/ammico/display.py +++ b/ammico/display.py @@ -5,6 +5,8 @@ import pandas as pd from dash import html, Input, Output, dcc, State, Dash from PIL import Image import dash_bootstrap_components as dbc +import warnings +from typing import Dict, Any, List COLOR_SCHEMES = [ @@ -94,6 +96,9 @@ class AnalysisExplorer: State("left_select_id", "options"), State("left_select_id", "value"), State("Dropdown_select_Detector", "value"), + State("Dropdown_analysis_type", "value"), + State("checkbox_enable_image_tasks", "value"), + State("textarea_questions", "value"), State("setting_privacy_env_var", "value"), State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_race_threshold", "value"), @@ -291,6 +296,37 @@ class AnalysisExplorer: ), justify="start", ), + # NEW: Analysis-type selector (summary/questions/summary_and_questions) + dbc.Row( + dcc.Dropdown( + id="Dropdown_analysis_type", + options=[{"label": v, "value": v} for v in SUMMARY_ANALYSIS_TYPE], + value="summary_and_questions", + style={"width": "60%", "margin-top": "8px"}, + ), + justify="start", + ), + # NEW: Enable image-level tasks (VQA / caption) checkbox + dbc.Row( + dcc.Checklist( + id="checkbox_enable_image_tasks", + options=[{"label": "Enable Image Tasks (Caption / VQA)", "value": "enabled"}], + value=["enabled"], # default enabled + inline=True, + style={"margin-top": "8px"}, + ), + justify="start", + ), + # NEW: Questions textarea (newline-separated). Only used if analysis_type includes "questions". 
+ dbc.Row( + dcc.Textarea( + id="textarea_questions", + value="Are there people in the image?\nWhat is this picture about?", + placeholder="One question per line...", + style={"width": "60%", "height": "120px", "margin-top": "8px"}, + ), + justify="start", + ), dbc.Row( children=[self._create_setting_layout()], id="div_detector_args", @@ -383,6 +419,7 @@ class AnalysisExplorer: current_img_value: str, detector_value: str, setting_privacy_env_var: str, + checkbox_enable_image_tasks_value: List[str], setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, setting_emotion_gender_threshold: int, @@ -414,6 +451,10 @@ class AnalysisExplorer: # detector value is the string name of the chosen detector identify_function = identify_dict[detector_value] + identify_function = identify_dict.get(detector_value) + if identify_function is None: + detector_class = None + if detector_value == "TextDetector": detector_class = identify_function( image_copy, @@ -442,8 +483,32 @@ class AnalysisExplorer: ) else: detector_class = identify_function(image_copy) - analysis_dict = detector_class.analyse_image() + + if detector_class is not None: + analysis_dict = detector_class.analyse_image() + else: + analysis_dict = {} + image_tasks_result: Dict[str, Any] = {} + enable_image_tasks = "enabled" in (checkbox_enable_image_tasks_value or []) + if enable_image_tasks: + # parse questions textarea: newline separated + if textarea_questions_value: + questions_list = [q.strip() for q in textarea_questions_value.splitlines() if q.strip()] + else: + questions_list = None + + try: + image_tasks_result = self.analyse_image( + image_copy, + analysis_type=analysis_type_value, + list_of_questions=questions_list, + is_concise_summary=True, + is_concise_answer=True, + ) + except Exception as e: + warnings.warn(f"Image tasks failed: {e}") + image_tasks_result = {"image_tasks_error": str(e)} # Initialize an empty dictionary new_analysis_dict = {} @@ -459,6 +524,18 @@ class AnalysisExplorer: # Add the new key-value pair to the new dictionary new_analysis_dict[k] = new_value + if "caption" in image_tasks_result: + new_analysis_dict["caption"] = image_tasks_result.get("caption", "") + if "vqa" in image_tasks_result: + # vqa is expected to be a dict; convert to readable string + vqa_entries = image_tasks_result["vqa"] + if isinstance(vqa_entries, dict): + new_analysis_dict["vqa"] = "; ".join([f"{q}: {a}" for q, a in vqa_entries.items()]) + else: + new_analysis_dict["vqa"] = str(vqa_entries) + for err_key in ("caption_error", "vqa_error", "image_tasks_error"): + if err_key in image_tasks_result: + new_analysis_dict[err_key] = image_tasks_result[err_key] df = pd.DataFrame([new_analysis_dict]).set_index("filename").T df.index.rename("filename", inplace=True) From 402a379f9c819d89b752ebb4c27af261c3f7c097 Mon Sep 17 00:00:00 2001 From: Dmitrii Kapitan Date: Sat, 27 Sep 2025 16:42:05 +0200 Subject: [PATCH 10/31] basic integration into display functionality --- ammico/display.py | 251 +++++++++++++++++++++++----------------- ammico/image_summary.py | 58 ++++++++-- 2 files changed, 192 insertions(+), 117 deletions(-) diff --git a/ammico/display.py b/ammico/display.py index ff86a3e..b916dbf 100644 --- a/ammico/display.py +++ b/ammico/display.py @@ -1,12 +1,14 @@ import ammico.faces as faces import ammico.text as text import ammico.colors as colors +import ammico.image_summary as image_summary +from ammico.model import MultimodalSummaryModel import pandas as pd from dash import html, Input, Output, dcc, State, Dash from PIL 
import Image import dash_bootstrap_components as dbc import warnings -from typing import Dict, Any, List +from typing import Dict, Any, List, Optional COLOR_SCHEMES = [ @@ -97,7 +99,6 @@ class AnalysisExplorer: State("left_select_id", "value"), State("Dropdown_select_Detector", "value"), State("Dropdown_analysis_type", "value"), - State("checkbox_enable_image_tasks", "value"), State("textarea_questions", "value"), State("setting_privacy_env_var", "value"), State("setting_Emotion_emotion_threshold", "value"), @@ -112,9 +113,15 @@ class AnalysisExplorer: Output("settings_TextDetector", "style"), Output("settings_EmotionDetector", "style"), Output("settings_ColorDetector", "style"), + Output("settings_VQA", "style"), Input("Dropdown_select_Detector", "value"), )(self._update_detector_setting) + self.app.callback( + Output("textarea_questions", "style"), + Input("Dropdown_analysis_type", "value"), + )(self._show_questions_textarea_on_demand) + # I split the different sections into subfunctions for better clarity def _top_file_explorer(self, mydict: dict) -> html.Div: """Initialize the file explorer dropdown for selecting the file to be analyzed. @@ -268,8 +275,69 @@ class AnalysisExplorer: ) ], ), + # start VQA settings + html.Div( + id="settings_VQA", + style={"display": "none"}, + children=[ + dbc.Card( + [ + dbc.CardBody( + [ + dbc.Row( + dbc.Col( + dcc.Dropdown( + id="Dropdown_analysis_type", + options=[ + {"label": v, "value": v} + for v in SUMMARY_ANALYSIS_TYPE + ], + value="summary_and_questions", + clearable=False, + style={ + "width": "100%", + "minWidth": "240px", + "maxWidth": "520px", + }, + ), + ), + justify="start", + ), + html.Div(style={"height": "8px"}), + dbc.Row( + [ + dbc.Col( + dcc.Textarea( + id="textarea_questions", + value="Are there people in the image?\nWhat is this picture about?", + placeholder="One question per line...", + style={ + "width": "100%", + "minHeight": "160px", + "height": "220px", + "resize": "vertical", + "overflow": "auto", + }, + rows=8, + ), + width=12, + ), + ], + justify="start", + ), + ] + ) + ], + style={ + "width": "100%", + "marginTop": "10px", + "zIndex": 2000, + }, + ) + ], + ), ], - style={"width": "100%", "display": "inline-block"}, + style={"width": "100%", "display": "inline-block", "overflow": "visible"}, ) return settings_layout @@ -289,6 +357,7 @@ class AnalysisExplorer: "TextDetector", "EmotionDetector", "ColorDetector", + "VQA", ], value="TextDetector", id="Dropdown_select_Detector", @@ -296,37 +365,6 @@ class AnalysisExplorer: ), justify="start", ), - # NEW: Analysis-type selector (summary/questions/summary_and_questions) - dbc.Row( - dcc.Dropdown( - id="Dropdown_analysis_type", - options=[{"label": v, "value": v} for v in SUMMARY_ANALYSIS_TYPE], - value="summary_and_questions", - style={"width": "60%", "margin-top": "8px"}, - ), - justify="start", - ), - # NEW: Enable image-level tasks (VQA / caption) checkbox - dbc.Row( - dcc.Checklist( - id="checkbox_enable_image_tasks", - options=[{"label": "Enable Image Tasks (Caption / VQA)", "value": "enabled"}], - value=["enabled"], # default enabled - inline=True, - style={"margin-top": "8px"}, - ), - justify="start", - ), - # NEW: Questions textarea (newline-separated). Only used if analysis_type includes "questions". 
- dbc.Row( - dcc.Textarea( - id="textarea_questions", - value="Are there people in the image?\nWhat is this picture about?", - placeholder="One question per line...", - style={"width": "60%", "height": "120px", "margin-top": "8px"}, - ), - justify="start", - ), dbc.Row( children=[self._create_setting_layout()], id="div_detector_args", @@ -402,15 +440,22 @@ class AnalysisExplorer: } if setting_input == "TextDetector": - return display_flex, display_none, display_none + return display_flex, display_none, display_none, display_none if setting_input == "EmotionDetector": - return display_none, display_flex, display_none + return display_none, display_flex, display_none, display_none if setting_input == "ColorDetector": - return display_none, display_none, display_flex - + return display_none, display_none, display_flex, display_none + if setting_input == "VQA": + return display_none, display_none, display_none, display_flex else: - return display_none, display_none, display_none + return display_none, display_none, display_none, display_none + + def _parse_questions(self, text: Optional[str]) -> Optional[List[str]]: + if not text: + return None + qs = [q.strip() for q in text.splitlines() if q.strip()] + return qs if qs else None def _right_output_analysis( self, @@ -418,8 +463,9 @@ class AnalysisExplorer: all_img_options: dict, current_img_value: str, detector_value: str, + analysis_type_value: str, + textarea_questions_value: str, setting_privacy_env_var: str, - checkbox_enable_image_tasks_value: List[str], setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, setting_emotion_gender_threshold: int, @@ -439,78 +485,71 @@ class AnalysisExplorer: "EmotionDetector": faces.EmotionDetector, "TextDetector": text.TextDetector, "ColorDetector": colors.ColorDetector, + "VQA": image_summary.ImageSummaryDetector, } # Get image ID from dropdown value, which is the filepath if current_img_value is None: return {} image_id = all_img_options[current_img_value] - # copy image so prvious runs don't leave their default values in the dict - image_copy = self.mydict[image_id].copy() - - # detector value is the string name of the chosen detector - identify_function = identify_dict[detector_value] - - identify_function = identify_dict.get(detector_value) - if identify_function is None: - detector_class = None - - if detector_value == "TextDetector": - detector_class = identify_function( - image_copy, - accept_privacy=( - setting_privacy_env_var - if setting_privacy_env_var - else "PRIVACY_AMMICO" - ), - ) - elif detector_value == "EmotionDetector": - detector_class = identify_function( - image_copy, - emotion_threshold=setting_emotion_emotion_threshold, - race_threshold=setting_emotion_race_threshold, - gender_threshold=setting_emotion_gender_threshold, - accept_disclosure=( - setting_emotion_env_var - if setting_emotion_env_var - else "DISCLOSURE_AMMICO" - ), - ) - elif detector_value == "ColorDetector": - detector_class = identify_function( - image_copy, - delta_e_method=setting_color_delta_e_method, - ) - else: - detector_class = identify_function(image_copy) - - if detector_class is not None: - analysis_dict = detector_class.analyse_image() - else: - analysis_dict = {} - - image_tasks_result: Dict[str, Any] = {} - enable_image_tasks = "enabled" in (checkbox_enable_image_tasks_value or []) - if enable_image_tasks: - # parse questions textarea: newline separated - if textarea_questions_value: - questions_list = [q.strip() for q in textarea_questions_value.splitlines() if q.strip()] - else: 
- questions_list = None + image_copy = self.mydict.get(image_id, {}).copy() + analysis_dict: Dict[str, Any] = {} + if detector_value == "VQA": try: - image_tasks_result = self.analyse_image( + qwen_model = MultimodalSummaryModel( + model_id="Qwen/Qwen2.5-VL-3B-Instruct" + ) # TODO: allow user to specify model + vqa_cls = identify_dict.get("VQA") + vqa_detector = vqa_cls(qwen_model, subdict={}) + questions_list = self._parse_questions(textarea_questions_value) + analysis_result = vqa_detector.analyse_image( image_copy, analysis_type=analysis_type_value, list_of_questions=questions_list, is_concise_summary=True, is_concise_answer=True, ) + analysis_dict = analysis_result or {} except Exception as e: - warnings.warn(f"Image tasks failed: {e}") - image_tasks_result = {"image_tasks_error": str(e)} - # Initialize an empty dictionary - new_analysis_dict = {} + warnings.warn(f"VQA/Image tasks failed: {e}") + analysis_dict = {"image_tasks_error": str(e)} + else: + # detector value is the string name of the chosen detector + identify_function = identify_dict[detector_value] + + if detector_value == "TextDetector": + detector_class = identify_function( + image_copy, + accept_privacy=( + setting_privacy_env_var + if setting_privacy_env_var + else "PRIVACY_AMMICO" + ), + ) + elif detector_value == "EmotionDetector": + detector_class = identify_function( + image_copy, + emotion_threshold=setting_emotion_emotion_threshold, + race_threshold=setting_emotion_race_threshold, + gender_threshold=setting_emotion_gender_threshold, + accept_disclosure=( + setting_emotion_env_var + if setting_emotion_env_var + else "DISCLOSURE_AMMICO" + ), + ) + elif detector_value == "ColorDetector": + detector_class = identify_function( + image_copy, + delta_e_method=setting_color_delta_e_method, + ) + else: + detector_class = identify_function(image_copy) + + analysis_dict = detector_class.analyse_image() + + new_analysis_dict: Dict[str, Any] = {} # Iterate over the items in the original dictionary for k, v in analysis_dict.items(): @@ -524,21 +563,15 @@ class AnalysisExplorer: # Add the new key-value pair to the new dictionary new_analysis_dict[k] = new_value - if "caption" in image_tasks_result: - new_analysis_dict["caption"] = image_tasks_result.get("caption", "") - if "vqa" in image_tasks_result: - # vqa is expected to be a dict; convert to readable string - vqa_entries = image_tasks_result["vqa"] - if isinstance(vqa_entries, dict): - new_analysis_dict["vqa"] = "; ".join([f"{q}: {a}" for q, a in vqa_entries.items()]) - else: - new_analysis_dict["vqa"] = str(vqa_entries) - for err_key in ("caption_error", "vqa_error", "image_tasks_error"): - if err_key in image_tasks_result: - new_analysis_dict[err_key] = image_tasks_result[err_key] df = pd.DataFrame([new_analysis_dict]).set_index("filename").T df.index.rename("filename", inplace=True) return dbc.Table.from_dataframe( df, striped=True, bordered=True, hover=True, index=True ) + + def _show_questions_textarea_on_demand(self, analysis_type_value: str) -> dict: + if analysis_type_value in ("questions", "summary_and_questions"): + return {"display": "block", "width": "100%"} + else: + return {"display": "none"} diff --git a/ammico/image_summary.py b/ammico/image_summary.py index 3ccc3f4..203ef21 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -16,7 +16,7 @@ class ImageSummaryDetector(AnalysisMethod): def __init__( self, summary_model: MultimodalSummaryModel, - subdict: dict = {}, + subdict: Optional[Dict[str, Any]] = None, ) -> None: """ Class for analysing 
images using QWEN-2.5-VL model. @@ -29,6 +29,8 @@ class ImageSummaryDetector(AnalysisMethod): Returns: None. """ + if subdict is None: + subdict = {} super().__init__(subdict) self.summary_model = summary_model @@ -148,7 +150,50 @@ class ImageSummaryDetector(AnalysisMethod): return analysis_type, list_of_questions, is_summary, is_questions - def analyse_images( + def analyse_image( + self, + entry: dict, + analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY_AND_QUESTIONS, + list_of_questions: Optional[List[str]] = None, + max_questions_per_image: int = 32, + is_concise_summary: bool = True, + is_concise_answer: bool = True, + ) -> Dict[str, Any]: + """ + Analyse a single image entry. Returns dict with keys depending on analysis_type: + - 'caption' (str) if summary requested + - 'vqa' (dict) if questions requested + """ + self.subdict = entry + analysis_type, list_of_questions, is_summary, is_questions = ( + self._validate_analysis_type( + analysis_type, list_of_questions, max_questions_per_image + ) + ) + + if is_summary: + try: + caps = self.generate_caption( + entry, + num_return_sequences=1, + is_concise_summary=is_concise_summary, + ) + self.subdict["caption"] = caps[0] if caps else "" + except Exception as e: + warnings.warn(f"Caption generation failed: {e}") + + if is_questions: + try: + vqa_map = self.answer_questions( + list_of_questions, entry, is_concise_answer + ) + self.subdict["vqa"] = vqa_map + except Exception as e: + warnings.warn(f"VQA failed: {e}") + + return self.subdict + + def analyse_images_from_dict( self, analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, list_of_questions: Optional[List[str]] = None, @@ -191,9 +236,7 @@ class ImageSummaryDetector(AnalysisMethod): ) entry["caption"] = caps[0] if caps else "" except Exception as e: - warnings.warn( - "Caption generation failed for key %s: %s", key, e - ) + warnings.warn(f"Caption generation failed: {e}") if is_questions: try: @@ -202,7 +245,7 @@ class ImageSummaryDetector(AnalysisMethod): ) entry["vqa"] = vqa_map except Exception as e: - warnings.warn("VQA failed for key %s: %s", key, e) + warnings.warn(f"VQA failed: {e}") self.subdict[key] = entry return self.subdict @@ -251,8 +294,7 @@ class ImageSummaryDetector(AnalysisMethod): ) except RuntimeError as e: warnings.warn( - "Retry without autocast failed: %s. Attempting cudnn-disabled retry.", - e, + f"Retry without autocast failed: {e}. Attempting cudnn-disabled retry." ) cudnn_was_enabled = ( torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled From 75b9bc101bd122232ceab01b598423b86b3f939b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Sep 2025 10:33:59 +0200 Subject: [PATCH 11/31] [pre-commit.ci] pre-commit autoupdate (#265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.10 → v0.13.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.10...v0.13.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b28fc9..6dd7854 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: files: ".ipynb" - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.12.10 + rev: v0.13.0 hooks: # Run the linter. 
- id: ruff-check From 32d032595d1632fb77263c6e1407cf1856a10d11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:53:38 +0200 Subject: [PATCH 12/31] [pre-commit.ci] pre-commit autoupdate (#267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.13.0 → v0.13.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.13.0...v0.13.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6dd7854..d879bc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: files: ".ipynb" - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.13.0 + rev: v0.13.1 hooks: # Run the linter. - id: ruff-check From 483f128f9608c5c376c8334f2fc8dcbf4af403c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:26:18 +0200 Subject: [PATCH 13/31] [pre-commit.ci] pre-commit autoupdate (#269) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.13.1 → v0.13.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.13.1...v0.13.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d879bc0..03d5cf5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: files: ".ipynb" - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.13.1 + rev: v0.13.3 hooks: # Run the linter. 
- id: ruff-check From 3018800ed47843c88fc9956f2755bec399ee37f6 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Thu, 9 Oct 2025 17:15:32 +0200 Subject: [PATCH 14/31] update test-display --- ammico/display.py | 6 ++++++ ammico/test/test_display.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/ammico/display.py b/ammico/display.py index b916dbf..5b860a7 100644 --- a/ammico/display.py +++ b/ammico/display.py @@ -100,6 +100,7 @@ class AnalysisExplorer: State("Dropdown_select_Detector", "value"), State("Dropdown_analysis_type", "value"), State("textarea_questions", "value"), + State("setting_Text_analyse_text", "value"), State("setting_privacy_env_var", "value"), State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_race_threshold", "value"), @@ -465,6 +466,7 @@ class AnalysisExplorer: detector_value: str, analysis_type_value: str, textarea_questions_value: str, + settings_text_analyse_text: list, setting_privacy_env_var: str, setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, @@ -519,8 +521,12 @@ class AnalysisExplorer: identify_function = identify_dict[detector_value] if detector_value == "TextDetector": + analyse_text = ( + True if settings_text_analyse_text == ["Analyse text"] else False + ) detector_class = identify_function( image_copy, + analyse_text=analyse_text, accept_privacy=( setting_privacy_env_var if setting_privacy_env_var diff --git a/ammico/test/test_display.py b/ammico/test/test_display.py index d7241a2..3cdb333 100644 --- a/ammico/test/test_display.py +++ b/ammico/test/test_display.py @@ -50,6 +50,9 @@ def test_right_output_analysis_emotions(get_AE, get_options, monkeypatch): get_options[3], get_options[0], "EmotionDetector", + "summary", + "Some question", + True, "SOME_VAR", 50, 50, From d810dbc3669d640098a4135f6c06c25845729539 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 10 Oct 2025 17:05:48 +0200 Subject: [PATCH 15/31] add base model tests --- ammico/model.py | 15 +++++++++++++++ ammico/test/conftest.py | 10 ++++++++++ ammico/test/test_model.py | 27 +++++++++++++++++++++++++++ pyproject.toml | 3 +-- 4 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 ammico/test/test_model.py diff --git a/ammico/model.py b/ammico/model.py index 80cc31f..cdc1161 100644 --- a/ammico/model.py +++ b/ammico/model.py @@ -27,6 +27,15 @@ class MultimodalSummaryModel: cache_dir: huggingface cache dir (optional). 
""" self.device = self._resolve_device(device) + + if model_id is not None and model_id not in ( + self.DEFAULT_CUDA_MODEL, + self.DEFAULT_CPU_MODEL, + ): + raise ValueError( + f"model_id must be one of {self.DEFAULT_CUDA_MODEL} or {self.DEFAULT_CPU_MODEL}" + ) + self.model_id = model_id or ( self.DEFAULT_CUDA_MODEL if self.device == "cuda" else self.DEFAULT_CPU_MODEL ) @@ -94,6 +103,12 @@ class MultimodalSummaryModel: if self.model is not None: del self.model self.model = None + if self.processor is not None: + del self.processor + self.processor = None + if self.tokenizer is not None: + del self.tokenizer + self.tokenizer = None finally: try: if torch.cuda.is_available(): diff --git a/ammico/test/conftest.py b/ammico/test/conftest.py index cb42774..2010e1e 100644 --- a/ammico/test/conftest.py +++ b/ammico/test/conftest.py @@ -1,5 +1,6 @@ import os import pytest +from ammico.model import MultimodalSummaryModel @pytest.fixture @@ -46,3 +47,12 @@ def get_test_my_dict(get_path): }, } return test_my_dict + + +@pytest.fixture(scope="session") +def model(): + m = MultimodalSummaryModel(device="cpu") + try: + yield m + finally: + m.close() diff --git a/ammico/test/test_model.py b/ammico/test/test_model.py new file mode 100644 index 0000000..ac652c0 --- /dev/null +++ b/ammico/test/test_model.py @@ -0,0 +1,27 @@ +import pytest +import torch +from ammico.model import MultimodalSummaryModel + + +def test_model_init(model): + assert model.model is not None + assert model.processor is not None + assert model.tokenizer is not None + assert model.device is not None + + +def test_model_invalid_device(): + with pytest.raises(ValueError): + MultimodalSummaryModel(device="invalid_device") + + +def test_model_invalid_model_id(): + with pytest.raises(ValueError): + MultimodalSummaryModel(model_id="non_existent_model", device="cpu") + + +def test_free_resources(): + model = MultimodalSummaryModel(device="cpu") + model.close() + assert model.model is None + assert model.processor is None diff --git a/pyproject.toml b/pyproject.toml index c0f5440..4f29217 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,10 +26,9 @@ dependencies = [ "colour-science", "dash", "dash-bootstrap-components", - "decord", "deepface", "google-cloud-vision", - "googletrans-py", # instead of googletrans4.0.0rc1, for a temporary solution due the incompatibility with jupyterlab + "googletrans-py", # instead of googletrans4.0.0rc1, for a temporary solution due to incompatibility with jupyterlab "grpcio", "huggingface-hub>=0.34.0", "importlib_metadata", From d6e0fbeffe6841839d0ae6114f08525b43567631 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Mon, 13 Oct 2025 13:51:24 +0200 Subject: [PATCH 16/31] add vqa tests --- ammico/test/test_image_summary.py | 37 +++++++++++++++++++++++++++++++ ammico/test/test_model.py | 5 ++++- pyproject.toml | 2 +- 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 ammico/test/test_image_summary.py diff --git a/ammico/test/test_image_summary.py b/ammico/test/test_image_summary.py new file mode 100644 index 0000000..ad48298 --- /dev/null +++ b/ammico/test/test_image_summary.py @@ -0,0 +1,37 @@ +from ammico.image_summary import ImageSummaryDetector + +import pytest + + +@pytest.mark.long +def test_image_summary_detector(model, get_testdict): + detector = ImageSummaryDetector(summary_model=model, subdict=get_testdict) + results = detector.analyse_images_from_dict(analysis_type="summary") + assert len(results) == 2 + for key in get_testdict.keys(): + assert key in results + assert 
"caption" in results[key] + assert isinstance(results[key]["caption"], str) + assert len(results[key]["caption"]) > 0 + + +@pytest.mark.long +def test_image_summary_detector_questions(model, get_testdict): + list_of_questions = [ + "What is happening in the image?", + "How many cars are in the image in total?", + ] + detector = ImageSummaryDetector(summary_model=model, subdict=get_testdict) + results = detector.analyse_images_from_dict( + analysis_type="questions", list_of_questions=list_of_questions + ) + assert len(results) == 2 + for key in get_testdict.keys(): + assert "vqa" in results[key] + if key == "IMG_2746": + assert "marathon" in results[key]["vqa"][0].lower() + + if key == "IMG_2809": + assert ( + "two" in results[key]["vqa"][1].lower() or "2" in results[key]["vqa"][1] + ) diff --git a/ammico/test/test_model.py b/ammico/test/test_model.py index ac652c0..d82dd86 100644 --- a/ammico/test/test_model.py +++ b/ammico/test/test_model.py @@ -1,8 +1,8 @@ import pytest -import torch from ammico.model import MultimodalSummaryModel +@pytest.mark.long def test_model_init(model): assert model.model is not None assert model.processor is not None @@ -10,16 +10,19 @@ def test_model_init(model): assert model.device is not None +@pytest.mark.long def test_model_invalid_device(): with pytest.raises(ValueError): MultimodalSummaryModel(device="invalid_device") +@pytest.mark.long def test_model_invalid_model_id(): with pytest.raises(ValueError): MultimodalSummaryModel(model_id="non_existent_model", device="cpu") +@pytest.mark.long def test_free_resources(): model = MultimodalSummaryModel(device="cpu") model.close() diff --git a/pyproject.toml b/pyproject.toml index 4f29217..1ead16e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "pandas", "Pillow", "pooch", - "qwen-vl-utils[decord]==0.0.8", + "qwen-vl-utils", "retina_face", "safetensors>=0.6.2", "setuptools", From af97981547b2a88609eb9ceafe4e74e93081f085 Mon Sep 17 00:00:00 2001 From: Dmitrii Kapitan Date: Tue, 14 Oct 2025 11:31:27 +0200 Subject: [PATCH 17/31] Excluding `long` tests from github actions, since there is not enough memory for it --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 015a8ee..1b1675a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: - name: Run pytest run: | cd ammico - python -m pytest -svv -m "not gcv" --cov=. --cov-report=xml + python -m pytest -svv -m "not gcv and not long" --cov=. 
--cov-report=xml - name: Upload coverage if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11' uses: codecov/codecov-action@v3 From 4a04233536518e03989f8d5b7cf492f02cdb9072 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 17 Oct 2025 14:41:57 +0200 Subject: [PATCH 18/31] UI: fix obsolete analyze_test keyword --- ammico/display.py | 6 ------ ammico/test/test_display.py | 1 - 2 files changed, 7 deletions(-) diff --git a/ammico/display.py b/ammico/display.py index 5b860a7..b916dbf 100644 --- a/ammico/display.py +++ b/ammico/display.py @@ -100,7 +100,6 @@ class AnalysisExplorer: State("Dropdown_select_Detector", "value"), State("Dropdown_analysis_type", "value"), State("textarea_questions", "value"), - State("setting_Text_analyse_text", "value"), State("setting_privacy_env_var", "value"), State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_race_threshold", "value"), @@ -466,7 +465,6 @@ class AnalysisExplorer: detector_value: str, analysis_type_value: str, textarea_questions_value: str, - settings_text_analyse_text: list, setting_privacy_env_var: str, setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, @@ -521,12 +519,8 @@ class AnalysisExplorer: identify_function = identify_dict[detector_value] if detector_value == "TextDetector": - analyse_text = ( - True if settings_text_analyse_text == ["Analyse text"] else False - ) detector_class = identify_function( image_copy, - analyse_text=analyse_text, accept_privacy=( setting_privacy_env_var if setting_privacy_env_var diff --git a/ammico/test/test_display.py b/ammico/test/test_display.py index 3cdb333..d1b4cae 100644 --- a/ammico/test/test_display.py +++ b/ammico/test/test_display.py @@ -52,7 +52,6 @@ def test_right_output_analysis_emotions(get_AE, get_options, monkeypatch): "EmotionDetector", "summary", "Some question", - True, "SOME_VAR", 50, 50, From 92c7ac1f6dd37469e12ee748f05277d93baa6bc7 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Wed, 22 Oct 2025 14:00:22 +0200 Subject: [PATCH 19/31] build: fix version issues with numpy/tensorflow and ipywidgets --- FAQ.md | 4 ++++ pyproject.toml | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/FAQ.md b/FAQ.md index b78b90b..c282936 100644 --- a/FAQ.md +++ b/FAQ.md @@ -62,6 +62,10 @@ Be careful, it requires around 7 GB of disk space. ![Screenshot 2023-06-01 165712](https://github.com/ssciwr/AMMICO/assets/8105097/3dfb302f-c390-46a7-a700-4e044f56c30f) +### Version clashes between tensorflow and numpy + +Due to the `faces` module, the tensorflow version is currently fixed to at most `2.14.0`. This requires that `numpy` is restricted to `numpy==1.23.5`. If you experience issues with compatibility between tensorflow and numpy, you can try fixing the numpy version to this version. + ## What happens to the images that are sent to google Cloud Vision? You have to accept the privacy statement of ammico to run this type of analyis. 
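
For users who run into the tensorflow/numpy clash described in the FAQ note above, one illustrative way to apply the pin in an existing environment (versions taken from the FAQ and `pyproject.toml`; adapt as needed) is:

```
python -m pip install "numpy==1.23.5" "tensorflow<2.15"
```
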
diff --git a/pyproject.toml b/pyproject.toml index 1ead16e..3516875 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "importlib_metadata", "importlib_resources", "matplotlib", - "numpy", + "numpy==1.23.5", "opencv-python", "pandas", "Pillow", @@ -69,6 +69,7 @@ nb = [ "datasets", "huggingface-hub", "ipython", + "ipykernel<=6.30.1", "jupyter", "jupyter_dash", ] From 8487947f5d5925756939d1c777caaad1717077a2 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Thu, 23 Oct 2025 10:02:09 +0200 Subject: [PATCH 20/31] fix: update function call in notebook to conform with renaming --- ammico/notebooks/DemoImageSummaryVQA.ipynb | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ammico/notebooks/DemoImageSummaryVQA.ipynb b/ammico/notebooks/DemoImageSummaryVQA.ipynb index b067e1f..c15e3e7 100644 --- a/ammico/notebooks/DemoImageSummaryVQA.ipynb +++ b/ammico/notebooks/DemoImageSummaryVQA.ipynb @@ -62,7 +62,7 @@ "outputs": [], "source": [ "image_dict = ammico.find_files(\n", - " path=str(\"/insert/your/path/here/\"),\n", + " path=str(\"../../data/in\"),\n", " limit=-1, # -1 means no limit on the number of files, by default it is set to 20\n", ")" ] @@ -114,7 +114,9 @@ "metadata": {}, "outputs": [], "source": [ - "summaries = img.analyse_images(analysis_type=\"summary\", is_concise_summary=False)" + "summaries = img.analyse_images_from_dict(\n", + " analysis_type=\"summary\", is_concise_summary=False\n", + ")" ] }, { @@ -158,17 +160,25 @@ "metadata": {}, "outputs": [], "source": [ - "vqa_results = img.analyse_images(\n", + "vqa_results = img.analyse_images_from_dict(\n", " analysis_type=\"questions\",\n", " list_of_questions=questions,\n", " is_concise_answer=True,\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "ammico-dev", + "display_name": "ammico", "language": "python", "name": "python3" }, @@ -182,7 +192,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.14" } }, "nbformat": 4, From 1f2642f1b423a20734e3780c5d7e0dc04e7ab8f3 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 24 Oct 2025 08:35:26 +0200 Subject: [PATCH 21/31] fix: update function call in notebook to conform with renaming --- ammico/notebooks/DemoNotebook_ammico.ipynb | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index e17860c..6c3d355 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -142,7 +142,7 @@ "outputs": [], "source": [ "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = (\n", - " \"/home/inga/projects/misinformation-project/misinformation-notes/misinformation-campaign-981aa55a3b13.json\"\n", + " \"../../data/misinformation-campaign-981aa55a3b13.json\"\n", ")" ] }, @@ -358,6 +358,11 @@ "For the computationally demanding `SummaryDetector`, it is best to initialize the model first and then analyze each image while passing the model explicitly. This can be done in a separate loop or in the same loop as for text and emotion detection." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -370,7 +375,7 @@ " subdict=image_dict, summary_model=model\n", ")\n", "\n", - "image_summary_detector.analyse_images(analysis_type=\"summary\")" + "image_summary_detector.analyse_images_from_dict(analysis_type=\"summary\")" ] }, { @@ -387,9 +392,9 @@ "outputs": [], "source": [ "# initialize the models\n", - "# currently this does not work because of the way the summary detector is implemented\n", - "image_summary_detector = ammico.SummaryDetector(\n", - " subdict=image_dict, analysis_type=\"summary\", model_type=\"base\"\n", + "model = ammico.MultimodalSummaryModel()\n", + "image_summary_detector = ammico.ImageSummaryDetector(\n", + " subdict=image_dict, summary_model=model\n", ")\n", "\n", "for num, key in tqdm(\n", @@ -399,9 +404,9 @@ " image_dict[key]\n", " ).analyse_image() # analyse image with EmotionDetector and update dict\n", " image_dict[key] = ammico.TextDetector(\n", - " image_dict[key], analyse_text=True\n", + " image_dict[key]\n", " ).analyse_image() # analyse image with TextDetector and update dict\n", - " image_dict[key] = image_summary_detector.analyse_image(\n", + " image_dict[key] = image_summary_detector.analyse_images_from_dict(\n", " subdict=image_dict[key], analysis_type=\"summary\"\n", " ) # analyse image with SummaryDetector and update dict\n", "\n", @@ -1553,7 +1558,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.14" } }, "nbformat": 4, From a832142c442eadd09786795ee41dc616de7da1b9 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 24 Oct 2025 08:54:38 +0200 Subject: [PATCH 22/31] fix: update function calls, remove obsolete stuff --- ammico/faces.py | 1 - ammico/notebooks/DemoNotebook_ammico.ipynb | 53 +++++++--------------- 2 files changed, 17 insertions(+), 37 deletions(-) diff --git a/ammico/faces.py b/ammico/faces.py index 3730cbd..debbfa6 100644 --- a/ammico/faces.py +++ b/ammico/faces.py @@ -275,7 +275,6 @@ class EmotionDetector(AnalysisMethod): # one dictionary per face that is detected in the image # since we are only passing a subregion of the image # that contains one face, the list will only contain one dict - print("actions are:", self.actions) if self.actions != []: fresult["result"] = DeepFace.analyze( img_path=face, diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index 6c3d355..9d9642d 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -276,8 +276,7 @@ "# dump file name\n", "dump_file = \"dump_file.csv\"\n", "# dump every N images\n", - "dump_every = 10\n", - "print(len(image_dict))" + "dump_every = 10" ] }, { @@ -395,8 +394,15 @@ "model = ammico.MultimodalSummaryModel()\n", "image_summary_detector = ammico.ImageSummaryDetector(\n", " subdict=image_dict, summary_model=model\n", - ")\n", - "\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "for num, key in tqdm(\n", " enumerate(image_dict.keys()), total=len(image_dict)\n", "): # loop through all images\n", @@ -470,7 +476,7 @@ "metadata": {}, "outputs": [], "source": [ - "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "image_df.to_csv(\"data_out.csv\")" ] }, { @@ -489,7 +495,7 @@ "metadata": {}, "outputs": [], "source": [ - "ta = ammico.TextAnalyzer(csv_path=\"../data/ref/test.csv\", 
column_key=\"text\")" + "ta = ammico.TextAnalyzer(csv_path=\"test.csv\", column_key=\"text\")" ] }, { @@ -589,7 +595,7 @@ "metadata": {}, "outputs": [], "source": [ - "# os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/content/drive/MyDrive/misinformation-data/misinformation-campaign-981aa55a3b13.json\"\n" + "# os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \".json\"\n" ] }, { @@ -612,13 +618,6 @@ "): # loop through all images\n", " image_dict[key] = ammico.TextDetector(\n", " image_dict[key], # analyse image with TextDetector and update dict\n", - " analyse_text=True,\n", - " model_names=[\n", - " \"sshleifer/distilbart-cnn-12-6\",\n", - " \"distilbert-base-uncased-finetuned-sst-2-english\",\n", - " \"dbmdz/bert-large-cased-finetuned-conll03-english\",\n", - " ],\n", - " revision_numbers=[\"a4f8f3e\", \"af0f99b\", \"f2482bf\"],\n", " ).analyse_image()\n", "\n", " if (\n", @@ -636,7 +635,7 @@ "source": [ "# write output to csv\n", "image_df = ammico.get_dataframe(image_dict)\n", - "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "image_df.to_csv(\"data_out.csv\")" ] }, { @@ -1035,7 +1034,7 @@ "source": [ "# write output to csv\n", "image_df = ammico.get_dataframe(image_dict)\n", - "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "image_df.to_csv(\"data_out.csv\")" ] }, { @@ -1445,24 +1444,6 @@ "The output are N primary colors and their corresponding percentage." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To check the analysis, you can inspect the analyzed elements here. Loading the results takes a moment, so please be patient. If you are sure of what you are doing, you can skip this and directly export a csv file in the step below.\n", - "Here, we display the color detection results provided by `colorgram` and `colour` libraries. Click on the tabs to see the results in the right sidebar. You may need to increment the `port` number if you are already running several notebook instances on the same server." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_explorer = ammico.AnalysisExplorer(image_dict)\n", - "analysis_explorer.run_server(port=8057)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1525,7 +1506,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "df.to_csv(\"data_out.csv\")" ] }, { @@ -1533,7 +1514,7 @@ "metadata": {}, "source": [ "## Further detector modules\n", - "Further detector modules exist, also it is possible to carry out a topic analysis on the text data, as well as crop social media posts automatically. These are more experimental features and have their own demonstration notebooks." + "Please get in touch or open an issue, if you would like to propose further detector modules." ] }, { From ff6de1c4361e692e27f03f8b9fdfb2bd446d523b Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 24 Oct 2025 11:19:30 +0200 Subject: [PATCH 23/31] update readme --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a2c4612..8fecbe2 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,10 @@ Use pre-processed image files such as social media posts with comments and proce 1. Content extraction from the images 1. Textual summary of the image content ("image caption") that can be analyzed further using the above tools 1. 
Feature extraction from the images: User inputs query and images are matched to that query (both text and image query) - 1. Question answering + 1. Question answering about image content +1. Content extractioni from the videos + 1. Textual summary of the video content that can be analyzed further + 1. Question answering about video content 1. Performing person and face recognition in images 1. Face mask detection 1. Probabilistic detection of age, gender and race @@ -69,7 +72,8 @@ The [Hugging Face transformers library](https://huggingface.co/) is used to perf ### Content extraction -The image content ("caption") is extracted using the [LAVIS](https://github.com/salesforce/LAVIS) library. This library enables vision intelligence extraction using several state-of-the-art models such as BLIP and BLIP2, depending on the task and user selection. Further, it allows feature extraction from the images, where users can input textual and image queries, and the images in the database are matched to that query (multimodal search). Another option is question answering, where the user inputs a text question and the library finds the images that match the query. +The image and video content ("caption") is now extracted using the Qwen2.5-VL +model. Qwen2.5-VL is a multimodal large language model capable of understanding and generating content from both images and videos. With its help, AMMMICO supports tasks such as image/video summarization and image/video visual question answering, where the model answers users' questions about the context of a media file. ### Emotion recognition From f277e86b29dca9a680560f485fdac313c1341c95 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 24 Oct 2025 14:37:45 +0200 Subject: [PATCH 24/31] refactor: remove duplicated close method --- ammico/model.py | 6 +----- ammico/notebooks/DemoNotebook_ammico.ipynb | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ammico/model.py b/ammico/model.py index cdc1161..8790123 100644 --- a/ammico/model.py +++ b/ammico/model.py @@ -97,7 +97,7 @@ class MultimodalSummaryModel: ) self.model.eval() - def _close(self) -> None: + def close(self) -> None: """Free model resources (helpful in long-running processes).""" try: if self.model is not None: @@ -120,7 +120,3 @@ class MultimodalSummaryModel: RuntimeWarning, stacklevel=2, ) - - def close(self) -> None: - """Free model resources (helpful in long-running processes).""" - self._close() diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index 9d9642d..8867699 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -1458,7 +1458,9 @@ "outputs": [], "source": [ "for key in image_dict.keys():\n", - " image_dict[key] = ammico.colors.ColorDetector(image_dict[key]).analyse_image()" + " image_dict[key] = ammico.colors.ColorDetector(image_dict[key]).analyse_image()\n", + "\n", + "print(\"testing signature\")" ] }, { From 9d382b7b6b56d8f2b35e73c3f165fb3ba2be4be1 Mon Sep 17 00:00:00 2001 From: DimasfromLavoisier Date: Fri, 24 Oct 2025 15:34:55 +0200 Subject: [PATCH 25/31] update notebook content --- ammico/notebooks/DemoNotebook_ammico.ipynb | 731 ++++----------------- 1 file changed, 121 insertions(+), 610 deletions(-) diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index 9d9642d..c9578e2 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -664,38 +664,15 @@ "\n", "\n", 
"\n", - "This module is based on the [LAVIS](https://github.com/salesforce/LAVIS) library. Since the models can be quite large, an initial object is created which will load the necessary models into RAM/VRAM and then use them in the analysis. The user can specify the type of analysis to be performed using the `analysis_type` keyword. Setting it to `summary` will generate a caption (summary), `questions` will prepare answers (VQA) to a list of questions as set by the user, `summary_and_questions` will do both. Note that the desired analysis type needs to be set here in the initialization of the \n", - "detector object, and not when running the analysis for each image; the same holds true for the selected model." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The implemented models are listed below.\n", + "### Multimodal Summary Model\n", "\n", - "| input model name | model |\n", - "| ---------------- | ----- |\n", - "| base | BLIP image captioning base, ViT-B/16, pretrained on COCO dataset |\n", - "| large | BLIP image captioning large, ViT-L/16, pretrained on COCO dataset |\n", - "| vqa | BLIP base model fine-tuned on VQA v2.0 dataset |\n", - "| blip2_t5_pretrain_flant5xxl | BLIP2 pretrained on FlanT5XXL | \n", - "| blip2_t5_pretrain_flant5xl | BLIP2 pretrained on FlanT5XL | \n", - "| blip2_t5_caption_coco_flant5xl | BLIP2 pretrained on FlanT5XL, fine-tuned on COCO | \n", - "| blip2_opt_pretrain_opt2.7b | BLIP2 pretrained on OPT-2.7b |\n", - "| blip2_opt_pretrain_opt6.7b | BLIP2 pretrained on OPT-6.7b | \n", - "| blip2_opt_caption_coco_opt2.7b | BLIP2 pretrained on OPT-2.7b, fine-tuned on COCO | \n", - "| blip2_opt_caption_coco_opt6.7b | BLIP2 pretrained on OPT-6.7b, fine-tuned on COCO |\n", + "This module is built on the Qwen2.5-VL model family. In this project, two model variants are supported: \n", "\n", - "Please note that `base`, `large` and `vqa` models can be run on the base TPU video card in Google Colab.\n", - "To run any advanced `BLIP2` models you need more than 20 gb of video memory, so you need to connect a paid A100 in Google Colab." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First of all, we can run only the summary module `analysis_type`. You can choose a `base` or a `large` model_type. " + "1. `Qwen2.5-VL-3B-Instruct`, which requires approximately 3 GB of video memory to load.\n", + "2. `Qwen2.5-VL-7B-Instruct`, which requires up to 8 GB of VRAM for initialization.\n", + "\n", + "Each version can be run on the CPU, but this will significantly increase the operating time, so we cannot recommend it, but we retain this option. 
\n", + "The model type can be specified when initializing the `MultimodalSummaryModel` class:" ] }, { @@ -704,32 +681,120 @@ "metadata": {}, "outputs": [], "source": [ - "image_summary_detector = ammico.SummaryDetector(\n", - " image_dict, analysis_type=\"summary\", model_type=\"base\"\n", + "model = ammico.MultimodalSummaryModel(\n", + " model_id=\"Qwen/Qwen2.5-VL-7B-Instruct\"\n", + ") # or \"Qwen/Qwen2.5-VL-3B-Instruct\" respectively" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also define the preferred device type (\"cpu\" or \"cuda\") explicitly during initialization:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = ammico.MultimodalSummaryModel(\n", + " model_id=\"Qwen/Qwen2.5-VL-7B-Instruct\", device=\"cuda\"\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the initialization follows this logic:\n", + "\n", + "If a GPU is available, it is automatically detected and the model defaults to Qwen2.5-VL-7B-Instruct on \"cuda\".\n", + "\n", + "If no GPU is detected, the system falls back to the Qwen2.5-VL-3B-Instruct model on the \"cpu\" device." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "for num, key in tqdm(enumerate(image_dict.keys()), total=len(image_dict)):\n", - " image_dict[key] = image_summary_detector.analyse_image(\n", - " subdict=image_dict[key], analysis_type=\"summary\"\n", - " )\n", - "\n", - " if num % dump_every == 0 | num == len(image_dict) - 1:\n", - " image_df = ammico.get_dataframe(image_dict)\n", - " image_df.to_csv(dump_file)" + "model = ammico.MultimodalSummaryModel()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For VQA, a list of questions needs to be passed when carrying out the analysis; these should be given as a list of strings." + "### Image Summary and VQA module\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To instantiate class it is required to provide `MultimodalSummaryModel` and dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_summary_vqa = ammico.ImageSummaryDetector(summary_model=model, subdict=image_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To perform image analysis, use the analyse_images_from_dict() method.\n", + "This function provides flexible options for generating summaries and performing visual question answering. \n", + "1. `analysis_type` – defines the type of analysis to perform. Setting it to `summary` will generate a caption (summary), `questions` will prepare answers (VQA) to a list of questions as set by the user, `summary_and_questions` will do both.\n", + "2. `list_of_questions` a list of text questions to be answered by the model. This parameter is required when analysis_type is set to \"questions\" or \"summary_and_questions\".\n", + "3. `keys_batch_size` controls the number of images processed per batch. Increasing this value may slightly improve performance, depending on your system.\n", + "The default is `16`, which provides a good balance between speed and stability on most setups.\n", + "4. 
`is_concise_summary` – determines the level of detail in generated captions:\n", + " * `True` → produces short, concise summaries.\n", + " * `False` → produces longer, more descriptive captions that may include additional context or atmosphere, but take more time to compute.\n", + "5. `is_concise_answer`– similar to the previous flag, but for controlling the level of detail in question answering responses." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example Usage**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To generate a concise image summary only:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary = ammico.analyse_images_from_dict(\n", + " analysis_type=\"summary\", is_concise_summary=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To generate detailed summaries and answer multiple questions:\n", + "\n", + "First, define a list of questions:" ] }, { @@ -749,7 +814,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you want to execute only the VQA module without captioning, just specify the `analysis_type` as `questions` and `model_type` as `vqa`. " + "Then call the function:" ] }, { @@ -758,232 +823,40 @@ "metadata": {}, "outputs": [], "source": [ - "image_summary_vqa_detector = ammico.SummaryDetector(\n", - " image_dict, analysis_type=\"questions\", model_type=\"vqa\"\n", - ")\n", - "\n", - "for num, key in tqdm(enumerate(image_dict.keys()), total=len(image_dict)):\n", - " image_dict[key] = image_summary_vqa_detector.analyse_image(\n", - " subdict=image_dict[key],\n", - " analysis_type=\"questions\",\n", - " list_of_questions=list_of_questions,\n", - " )\n", - " if num % dump_every == 0 | num == len(image_dict) - 1:\n", - " image_df = ammico.get_dataframe(image_dict)\n", - " image_df.to_csv(dump_file)" + "summary_and_answers = ammico.analyse_images_from_dict(\n", + " analysis_type=\"summary_and_questions\",\n", + " list_of_questions=list_of_questions,\n", + " is_concise_summary=False,\n", + " is_concise_answer=False,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Or you can specify the analysis type as `summary_and_questions`, then both caption creation and question answers will be generated for each image. In this case, you can choose a `base` or a `large` model_type. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_summary_vqa_detector = ammico.SummaryDetector(\n", - " image_dict, analysis_type=\"summary_and_questions\", model_type=\"base\"\n", - ")\n", - "for num, key in tqdm(enumerate(image_dict.keys()), total=len(image_dict)):\n", - " image_dict[key] = image_summary_vqa_detector.analyse_image(\n", - " subdict=image_dict[key],\n", - " analysis_type=\"summary_and_questions\",\n", - " list_of_questions=list_of_questions,\n", - " )\n", - " if num % dump_every == 0 | num == len(image_dict) - 1:\n", - " image_df = ammico.get_dataframe(image_dict)\n", - " image_df.to_csv(dump_file)" + "If you want to execute only the VQA module without captioning, just specify the `analysis_type` as `questions`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The output is given as a dictionary with the following keys and data types:\n", + "The output of the `analyse_images_from_dict()` method is a dictionary, where each key corresponds to an input image identifier. 
Each entry in this dictionary contains the processed results for that image.\n", "\n", "| output key | output type | output value |\n", "| ---------- | ----------- | ------------ |\n", - "| `const_image_summary` | `str` | when `analysis_type=\"summary\"` or `\"summary_and_questions\"`, constant image caption (does not change upon re-running the analysis for the same model) |\n", - "| `3_non-deterministic_summary` | `list[str]` | when `analysis_type=\"summary\"` or `summary_and_questions`, three different captions generated with different random seeds |\n", - "| *a user-defined input question* | `str` | when `analysis_type=\"questions\"` or `summary_and_questions`, the answer to the user-defined input question | \n" + "| `caption` | `str` | when `analysis_type=\"summary\"` or `\"summary_and_questions\"`, constant image caption |\n", + "| `vqa` | `list[str]` | when `analysis_type=\"questions\"` or `summary_and_questions`, the answers to the user-defined input question |\n", + "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### BLIP2 models\n", - "The BLIP2 models are computationally very heavy models, and require approximately 60GB of RAM. These models can easily use more than 20GB GPU memory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "obj = ammico.SummaryDetector(\n", - " subdict=image_dict,\n", - " analysis_type=\"summary_and_questions\",\n", - " model_type=\"blip2_t5_caption_coco_flant5xl\",\n", - ")\n", - "# list of the new models that can be used:\n", - "# \"blip2_t5_pretrain_flant5xxl\",\n", - "# \"blip2_t5_pretrain_flant5xl\",\n", - "# \"blip2_t5_caption_coco_flant5xl\",\n", - "# \"blip2_opt_pretrain_opt2.7b\",\n", - "# \"blip2_opt_pretrain_opt6.7b\",\n", - "# \"blip2_opt_caption_coco_opt2.7b\",\n", - "# \"blip2_opt_caption_coco_opt6.7b\",\n", - "\n", - "# You can use `pretrain_` model types for zero-shot image-to-text generation with prompts.\n", - "# Or you can use `caption_coco_`` model types to generate coco-style captions.\n", - "# `flant5` and `opt` means that the model equipped with FlanT5 and OPT LLMs respectively.\n", - "\n", - "# also you can perform all calculation on cpu if you set device_type= \"cpu\" or gpu if you set device_type= \"cuda\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also pass a list of questions to this cell if `analysis_type=\"summary_and_questions\"` or `analysis_type=\"questions\"`. But the format of questions has changed in new models. \n", - "\n", - "Here is an example of a list of questions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_of_questions = [\n", - " \"Question: Are there people in the image? Answer:\",\n", - " \"Question: What is this picture about? Answer:\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for key in image_dict:\n", - " image_dict[key] = obj.analyse_image(\n", - " subdict=image_dict[key],\n", - " analysis_type=\"questions\",\n", - " list_of_questions=list_of_questions,\n", - " )\n", - "\n", - "# analysis_type can be\n", - "# \"summary\",\n", - "# \"questions\",\n", - "# \"summary_and_questions\"." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also pass a question with previous answers as context into this model and pass in questions like this one to get a more accurate answer:\n", - "\n", - "You can combine as many questions as you want in a single query as a list." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_of_questions = [\n", - " \"Question: What country is in the picture? Answer: USA. Question: Why? Answer: Because there is an American flag in the background . Question: Where it comes from? Answer:\",\n", - " \"Question: Which city is this? Answer: Frankfurt. Question: Why?\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for key in image_dict:\n", - " image_dict[key] = obj.analyse_image(\n", - " subdict=image_dict[key],\n", - " analysis_type=\"questions\",\n", - " list_of_questions=list_of_questions,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also ask sequential questions if you pass the argument `consequential_questions=True`. This means that the answers to previous questions will be passed as context to the next question. However, this method will work a bit slower, because for each image the answers to the questions will not be calculated simultaneously, but sequentially. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "list_of_questions = [\n", - " \"Question: Is this picture taken inside or outside? Answer:\",\n", - " \"Question: Why? Answer:\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for key in image_dict:\n", - " image_dict[key] = obj.analyse_image(\n", - " subdict=image_dict[key],\n", - " analysis_type=\"questions\",\n", - " list_of_questions=list_of_questions,\n", - " consequential_questions=True,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# write output to csv\n", - "image_df = ammico.get_dataframe(image_dict)\n", - "image_df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "### Video summary and VQA module\n", + "This module is currently under development and will be demonstrated here as soon as it is ready." ] }, { @@ -1069,369 +942,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This module shows how to carry out an image multimodal search with the [LAVIS](https://github.com/salesforce/LAVIS) library. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Indexing and extracting features from images in selected folder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First you need to select a model. 
You can choose one of the following models: \n", - "- [blip](https://github.com/salesforce/BLIP)\n", - "- [blip2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) \n", - "- [albef](https://github.com/salesforce/ALBEF) \n", - "- [clip_base](https://github.com/openai/CLIP/blob/main/model-card.md)\n", - "- [clip_vitl14](https://github.com/mlfoundations/open_clip) \n", - "- [clip_vitl14_336](https://github.com/mlfoundations/open_clip)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_type = \"blip\"\n", - "# model_type = \"blip2\"\n", - "# model_type = \"albef\"\n", - "# model_type = \"clip_base\"\n", - "# model_type = \"clip_vitl14\"\n", - "# model_type = \"clip_vitl14_336\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To process the loaded images using the selected model, use the below code:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_dict = ammico.find_files(\n", - " path=str(data_path),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_obj = ammico.MultimodalSearch(image_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(\n", - " model,\n", - " vis_processors,\n", - " txt_processors,\n", - " image_keys,\n", - " image_names,\n", - " features_image_stacked,\n", - ") = my_obj.parsing_images(\n", - " model_type,\n", - " path_to_save_tensors=str(data_path),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The images are then processed and stored in a numerical representation, a tensor. These tensors do not change for the same image and same model - so if you run this analysis once, and save the tensors giving a path with the keyword `path_to_save_tensors`, a file with filename `.__saved_features_image.pt` will be placed there.\n", - "\n", - "This can save you time if you want to analyse the same images with the same model but different questions. To run using the saved tensors, execute the below code giving the path and name of the tensor file. Any subsequent query of the model will run in a fraction of the time than it run in initially." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment the code below if you want to load the tensors from the drive\n", - "# and just want to ask different questions for the same set of images\n", - "# (\n", - "# model,\n", - "# vis_processors,\n", - "# txt_processors,\n", - "# image_keys,\n", - "# image_names,\n", - "# features_image_stacked,\n", - "# ) = my_obj.parsing_images(\n", - "# model_type,\n", - "# path_to_load_tensors=\"/content/drive/MyDrive/misinformation-data/5_clip_base_saved_features_image.pt\",\n", - "# )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we already processed our image folder with 5 images and the `clip_base` model. So you need just to write the name `5_clip_base_saved_features_image.pt` of the saved file that consists of tensors of all images as keyword argument for `path_to_load_tensors`. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Formulate your search queries\n", - "\n", - "Next, you need to form search queries. You can search either by image or by text. You can search for a single query, or you can search for several queries at once, the computational time should not be much different. The format of the queries is as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import importlib_resources # only require for image query example\n", - "\n", - "image_example_query = str(\n", - " importlib_resources.files(\"ammico\") / \"data\" / \"test-crop-image.png\"\n", - ") # creating the path to the image for the image query example\n", - "\n", - "search_query = [\n", - " {\n", - " \"image\": image_example_query\n", - " }, # This is how looks image query, here `image_example_path` is the path to query image like \"data/test-crop-image.png\"\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can filter your results in 3 different ways:\n", - "- `filter_number_of_images` limits the number of images found. That is, if the parameter `filter_number_of_images = 10`, then the first 10 images that best match the query will be shown. The other images ranks will be set to `None` and the similarity value to `0`.\n", - "- `filter_val_limit` limits the output of images with a similarity value not bigger than `filter_val_limit`. That is, if the parameter `filter_val_limit = 0.2`, all images with similarity less than 0.2 will be discarded.\n", - "- `filter_rel_error` (percentage) limits the output of images with a similarity value not bigger than `100 * abs(current_similarity_value - best_similarity_value_in_current_search)/best_similarity_value_in_current_search < filter_rel_error`. That is, if we set filter_rel_error = 30, it means that if the top1 image have 0.5 similarity value, we discard all image with similarity less than 0.35." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "similarity, sorted_lists = my_obj.multimodal_search(\n", - " model,\n", - " vis_processors,\n", - " txt_processors,\n", - " model_type,\n", - " image_keys,\n", - " features_image_stacked,\n", - " search_query,\n", - " filter_number_of_images=20,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "similarity" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sorted_lists" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After launching `multimodal_search` function, the results of each query will be added to the source dictionary. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A special function was written to present the search results conveniently. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_obj.show_results(\n", - " search_query[0], # you can change the index to see the results for other queries\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Formulate your search queries: Search for the best match using multiple reference images, for example, of a person" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Here goes the code that reads in multiple images as reference\n", - "# then you will loop over these multiple images and find the best matches\n", - "# in the end, the best matches will be averaged over for each picture and a list of averaged best matches will be provided" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Improve the search results: Use only for text queries, not image search\n", - "\n", - "For even better results, a slightly different approach has been prepared that can improve search results. It is quite resource-intensive, so it is applied after the main algorithm has found the most relevant images. This approach works only with text queries and it skips image queries. Among the parameters you can choose 3 models: `\"blip_base\"`, `\"blip_large\"`, `\"blip2_coco\"`. If you get an `Out of Memory` error, try reducing the batch_size value (minimum = 1), which is the number of images being processed simultaneously. With the parameter `need_grad_cam = True/False` you can enable the calculation of the heat map of each image to be processed and save them in `image_gradcam_with_itm`. Thus the `image_text_match_reordering()` function calculates new similarity values and new ranks for each image. The resulting values are added to the general dictionary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itm_model = \"blip_base\"\n", - "# itm_model = \"blip_large\"\n", - "# itm_model = \"blip2_coco\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "itm_scores, image_gradcam_with_itm = my_obj.image_text_match_reordering(\n", - " search_query,\n", - " itm_model,\n", - " image_keys,\n", - " sorted_lists,\n", - " batch_size=1,\n", - " need_grad_cam=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then using the same output function you can add the `itm=True` argument to output the new image order. Remember that for images queries, an error will be thrown with `itm=True` argument. You can also add the `image_gradcam_with_itm` along with `itm=True` argument to output the heat maps of the calculated images." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_obj.show_results(\n", - " search_query[0], itm=True, image_gradcam_with_itm=image_gradcam_with_itm\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save search results to csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Convert the dictionary of dictionaries into a dictionary with lists:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "outdict = ammico.append_data_to_dict(image_dict)\n", - "df = ammico.dump_df(outdict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check the dataframe:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Write the csv file:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"/content/drive/MyDrive/misinformation-data/data_out.csv\")" + "This module is currently under development and will be demonstrated here as soon as it is ready." ] }, { From de8ee83432af2cdd8982cc1030c6ab6cf923169a Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Fri, 24 Oct 2025 16:36:03 +0200 Subject: [PATCH 26/31] refactor: use dictionary mapping for values, check question list strings for None --- ammico/image_summary.py | 72 +++++++++++++++++++++++-------- ammico/test/test_image_summary.py | 21 +++++++++ 2 files changed, 74 insertions(+), 19 deletions(-) diff --git a/ammico/image_summary.py b/ammico/image_summary.py index 203ef21..7d462f2 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -13,6 +13,22 @@ from qwen_vl_utils import process_vision_info class ImageSummaryDetector(AnalysisMethod): + token_prompt_config = { + "default": { + "summary": {"prompt": "Describe this image.", "max_new_tokens": 256}, + "questions": {"prompt": "", "max_new_tokens": 128}, + }, + "concise": { + "summary": { + "prompt": "Describe this image in one concise caption.", + "max_new_tokens": 64, + }, + "questions": {"prompt": "Answer concisely: ", "max_new_tokens": 128}, + }, + } + MAX_QUESTIONS_PER_IMAGE = 32 + KEYS_BATCH_SIZE = 16 + def __init__( self, summary_model: MultimodalSummaryModel, @@ -155,7 +171,7 @@ class ImageSummaryDetector(AnalysisMethod): entry: dict, analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY_AND_QUESTIONS, list_of_questions: Optional[List[str]] = None, - max_questions_per_image: int = 32, + max_questions_per_image: int = MAX_QUESTIONS_PER_IMAGE, is_concise_summary: bool = True, is_concise_answer: bool = True, ) -> Dict[str, Any]: @@ -197,8 +213,8 @@ class ImageSummaryDetector(AnalysisMethod): self, analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, list_of_questions: Optional[List[str]] = None, - max_questions_per_image: int = 32, - keys_batch_size: int = 16, + max_questions_per_image: int = MAX_QUESTIONS_PER_IMAGE, + keys_batch_size: int = KEYS_BATCH_SIZE, is_concise_summary: bool = True, is_concise_answer: bool = True, ) -> Dict[str, dict]: @@ -267,12 +283,12 @@ class ImageSummaryDetector(AnalysisMethod): Returns: results (list[str]): list of generated captions. 
""" - if is_concise_summary: - prompt = ["Describe this image in one concise caption."] - max_new_tokens = 64 - else: - prompt = ["Describe this image."] - max_new_tokens = 256 + prompt = self.token_prompt_config[ + "concise" if is_concise_summary else "default" + ]["summary"]["prompt"] + max_new_tokens = self.token_prompt_config[ + "concise" if is_concise_summary else "default" + ]["summary"]["max_new_tokens"] inputs = self._prepare_inputs(prompt, entry) gen_conf = GenerationConfig( @@ -333,6 +349,24 @@ class ImageSummaryDetector(AnalysisMethod): results = [d.strip() for d in decoded] return results + def _clean_list_of_questions( + self, list_of_questions: list[str], prompt: str + ) -> list[str]: + """Clean the list of questions to contain correctly formatted strings.""" + # remove all None or empty questions + list_of_questions = [i for i in list_of_questions if i and i.strip()] + # ensure each question ends with a question mark + list_of_questions = [ + i.strip() + "?" if not i.strip().endswith("?") else i.strip() + for i in list_of_questions + ] + # ensure each question starts with the prompt + list_of_questions = [ + i if i.lower().startswith(prompt.lower()) else prompt + i + for i in list_of_questions + ] + return list_of_questions + def answer_questions( self, list_of_questions: list[str], @@ -348,15 +382,15 @@ class ImageSummaryDetector(AnalysisMethod): Returns: answers (list[str]): list of answers. """ - if is_concise_answer: - gen_conf = GenerationConfig(max_new_tokens=64, do_sample=False) - for i in range(len(list_of_questions)): - if not list_of_questions[i].strip().endswith("?"): - list_of_questions[i] = list_of_questions[i].strip() + "?" - if not list_of_questions[i].lower().startswith("answer concisely"): - list_of_questions[i] = "Answer concisely: " + list_of_questions[i] - else: - gen_conf = GenerationConfig(max_new_tokens=128, do_sample=False) + prompt = self.token_prompt_config[ + "concise" if is_concise_answer else "default" + ]["answer"]["prompt"] + max_new_tokens = self.token_prompt_config[ + "concise" if is_concise_answer else "default" + ]["answer"]["max_new_tokens"] + + list_of_questions = self._clean_list_of_questions(list_of_questions, prompt) + gen_conf = GenerationConfig(max_new_tokens=max_new_tokens, do_sample=False) question_chunk_size = 8 answers: List[str] = [] @@ -396,7 +430,7 @@ class ImageSummaryDetector(AnalysisMethod): if len(answers) != len(list_of_questions): raise ValueError( - f"Expected {len(list_of_questions)} answers, but got {len(answers)}, try vary amount of questions" + f"Expected {len(list_of_questions)} answers, but got {len(answers)}, try varying amount of questions" ) return answers diff --git a/ammico/test/test_image_summary.py b/ammico/test/test_image_summary.py index ad48298..b56d806 100644 --- a/ammico/test/test_image_summary.py +++ b/ammico/test/test_image_summary.py @@ -35,3 +35,24 @@ def test_image_summary_detector_questions(model, get_testdict): assert ( "two" in results[key]["vqa"][1].lower() or "2" in results[key]["vqa"][1] ) + + +def test_clean_list_of_questions(model): + list_of_questions = [ + "What is happening in the image?", + "", + " ", + None, + "How many cars are in the image in total", + ] + detector = ImageSummaryDetector(summary_model=model, subdict={}) + prompt = detector.token_prompt_config["default"]["questions"]["prompt"] + cleaned_questions = detector._clean_list_of_questions(list_of_questions, prompt) + assert len(cleaned_questions) == 2 + assert cleaned_questions[0] == "What is happening in the image?" 
+ assert cleaned_questions[1] == "How many cars are in the image in total?" + prompt = detector.token_prompt_config["concise"]["questions"]["prompt"] + cleaned_questions = detector._clean_list_of_questions(list_of_questions, prompt) + assert len(cleaned_questions) == 2 + assert cleaned_questions[0] == prompt + "What is happening in the image?" + assert cleaned_questions[1] == prompt + "How many cars are in the image in total?" From 731077be7d74f8a644bcec7b1c7608000e7d337f Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 27 Oct 2025 09:42:12 +0100 Subject: [PATCH 27/31] test: add mock model for summary testing --- ammico/test/TESTING_WITH_MOCKS.md | 82 +++++++++++++++++++++++++++++++ ammico/test/conftest.py | 48 ++++++++++++++++++ ammico/test/test_image_summary.py | 71 +++++++++++++++++++++++++- 3 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 ammico/test/TESTING_WITH_MOCKS.md diff --git a/ammico/test/TESTING_WITH_MOCKS.md b/ammico/test/TESTING_WITH_MOCKS.md new file mode 100644 index 0000000..3b406da --- /dev/null +++ b/ammico/test/TESTING_WITH_MOCKS.md @@ -0,0 +1,82 @@ +# Testing with Mock Models + +This document explains how to use the mock model fixture to write fast unit tests that don't require loading the actual model. + +## Mock Model Fixture + +A `mock_model` fixture has been added to `conftest.py` that creates a lightweight mock of the `MultimodalSummaryModel` class. This fixture: + +- **Does not load any actual models** (super fast) +- **Mocks all necessary methods** (processor, tokenizer, model.generate, etc.) +- **Returns realistic tensor shapes** (so the code doesn't crash) +- **Can be used for fast unit tests** that don't need actual model inference + +## Usage + +Simply use `mock_model` instead of `model` in your test fixtures: + +```python +def test_my_feature(mock_model): + detector = ImageSummaryDetector(summary_model=mock_model, subdict={}) + # Your test code here + pass +``` + +## When to Use Mock vs Real Model + +### Use `mock_model` when: +- Testing utility functions (like `_clean_list_of_questions`) +- Testing input validation logic +- Testing data processing methods +- Testing class initialization +- **Any test that doesn't need actual model inference** + +### Use `model` (real model) when: +- Testing end-to-end functionality +- Testing actual caption generation quality +- Testing actual question answering +- Integration tests that verify model behavior +- **Any test marked with `@pytest.mark.long`** + +## Example Tests Added + +The following new tests use the mock model: + +1. `test_image_summary_detector_init_mock` - Tests initialization +2. `test_load_pil_if_needed_string` - Tests image loading +3. `test_is_sequence_but_not_str` - Tests utility methods +4. `test_validate_analysis_type` - Tests validation logic + +All of these run quickly without loading the model. 
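+
+As a quick illustration (a sketch only, not part of the shipped test suite), such a mock-based test can exercise pure-Python logic like the prompt configuration without any model inference; it relies only on the `mock_model` fixture and the `ImageSummaryDetector` class introduced in this patch series, and the test name is hypothetical:
+
+```python
+from ammico.image_summary import ImageSummaryDetector
+
+
+def test_token_prompt_config_keys(mock_model):
+    # No model weights are loaded; only a plain dictionary attribute is inspected.
+    detector = ImageSummaryDetector(summary_model=mock_model, subdict={})
+    assert set(detector.token_prompt_config) == {"default", "concise"}
+```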
+ +## Running Tests + +### Run only fast tests (with mocks): +```bash +pytest ammico/test/test_image_summary.py -v +``` + +### Run only long tests (with real model): +```bash +pytest ammico/test/test_image_summary.py -m long -v +``` + +### Run all tests: +```bash +pytest ammico/test/test_image_summary.py -v +``` + +## Customizing the Mock + +If you need to customize the mock's behavior for specific tests, you can override its methods: + +```python +def test_custom_behavior(mock_model): + # Customize the mock's return value + mock_model.tokenizer.batch_decode.return_value = ["custom", "output"] + + detector = ImageSummaryDetector(summary_model=mock_model, subdict={}) + # Test with custom behavior + pass +``` + diff --git a/ammico/test/conftest.py b/ammico/test/conftest.py index 2010e1e..c2d13a5 100644 --- a/ammico/test/conftest.py +++ b/ammico/test/conftest.py @@ -1,5 +1,6 @@ import os import pytest +from unittest.mock import Mock, MagicMock from ammico.model import MultimodalSummaryModel @@ -56,3 +57,50 @@ def model(): yield m finally: m.close() + + +@pytest.fixture +def mock_model(): + """ + Mock model fixture that doesn't load the actual model. + Useful for faster unit tests that don't need actual model inference. + """ + import torch + + # Create a mock model object + mock_model_obj = MagicMock(spec=["generate", "eval"]) + mock_model_obj.device = "cpu" + mock_model_obj.eval = MagicMock(return_value=mock_model_obj) + + # Create mock processor with necessary methods + mock_processor = MagicMock() + mock_processor.apply_chat_template = MagicMock( + side_effect=lambda messages, **kwargs: "processed_text" + ) + + # Mock processor to return tensor-like inputs + def mock_processor_call(text, images, **kwargs): + batch_size = len(text) if isinstance(text, list) else 1 + return { + "input_ids": torch.randint(0, 1000, (batch_size, 10)), + "pixel_values": torch.randn(batch_size, 3, 224, 224), + "attention_mask": torch.ones(batch_size, 10), + } + + mock_processor.__call__ = MagicMock(side_effect=mock_processor_call) + + # Create mock tokenizer + mock_tokenizer = MagicMock() + mock_tokenizer.batch_decode = MagicMock( + side_effect=lambda ids, **kwargs: ["mock caption" for _ in range(len(ids))] + ) + + # Create the mock model instance + mock_m = Mock() + mock_m.model = mock_model_obj + mock_m.processor = mock_processor + mock_m.tokenizer = mock_tokenizer + mock_m.device = "cpu" + mock_m.close = MagicMock() + + return mock_m diff --git a/ammico/test/test_image_summary.py b/ammico/test/test_image_summary.py index b56d806..475fbac 100644 --- a/ammico/test/test_image_summary.py +++ b/ammico/test/test_image_summary.py @@ -37,7 +37,7 @@ def test_image_summary_detector_questions(model, get_testdict): ) -def test_clean_list_of_questions(model): +def test_clean_list_of_questions(mock_model): list_of_questions = [ "What is happening in the image?", "", @@ -45,7 +45,7 @@ def test_clean_list_of_questions(model): None, "How many cars are in the image in total", ] - detector = ImageSummaryDetector(summary_model=model, subdict={}) + detector = ImageSummaryDetector(summary_model=mock_model, subdict={}) prompt = detector.token_prompt_config["default"]["questions"]["prompt"] cleaned_questions = detector._clean_list_of_questions(list_of_questions, prompt) assert len(cleaned_questions) == 2 @@ -56,3 +56,70 @@ def test_clean_list_of_questions(model): assert len(cleaned_questions) == 2 assert cleaned_questions[0] == prompt + "What is happening in the image?" 
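+    # the concise "questions" prompt ("Answer concisely: ") is prepended only when a question does not already start with it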
assert cleaned_questions[1] == prompt + "How many cars are in the image in total?" + + +# Fast tests using mock model (no actual model loading) +def test_image_summary_detector_init_mock(mock_model, get_testdict): + """Test detector initialization with mocked model.""" + detector = ImageSummaryDetector(summary_model=mock_model, subdict=get_testdict) + assert detector.summary_model is mock_model + assert len(detector.subdict) == 2 + + +def test_load_pil_if_needed_string(mock_model): + """Test loading image from file path.""" + detector = ImageSummaryDetector(summary_model=mock_model) + # This will try to actually load a file, so we'll use a test image + import os + + test_image_path = os.path.join(os.path.dirname(__file__), "data", "IMG_2746.png") + if os.path.exists(test_image_path): + img = detector._load_pil_if_needed(test_image_path) + from PIL import Image + + assert isinstance(img, Image.Image) + assert img.mode == "RGB" + + +def test_is_sequence_but_not_str(mock_model): + """Test sequence detection utility.""" + detector = ImageSummaryDetector(summary_model=mock_model) + assert detector._is_sequence_but_not_str([1, 2, 3]) is True + assert detector._is_sequence_but_not_str("string") is False + assert detector._is_sequence_but_not_str(b"bytes") is False + assert ( + detector._is_sequence_but_not_str({"a": 1}) is False + ) # dict is sequence-like but not handled as such + + +def test_validate_analysis_type(mock_model): + """Test analysis type validation.""" + detector = ImageSummaryDetector(summary_model=mock_model) + # Test valid types + _, _, is_summary, is_questions = detector._validate_analysis_type( + "summary", None, 10 + ) + assert is_summary is True + assert is_questions is False + + _, _, is_summary, is_questions = detector._validate_analysis_type( + "questions", ["What is this?"], 10 + ) + assert is_summary is False + assert is_questions is True + + _, _, is_summary, is_questions = detector._validate_analysis_type( + "summary_and_questions", ["What is this?"], 10 + ) + assert is_summary is True + assert is_questions is True + + # Test invalid type + with pytest.raises(ValueError): + detector._validate_analysis_type("invalid", None, 10) + + # Test too many questions + with pytest.raises(ValueError): + detector._validate_analysis_type( + "questions", ["Q" + str(i) for i in range(33)], 32 + ) From 237c6265fe7148f4897be7f5ab86f913f58a3e32 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 27 Oct 2025 09:49:41 +0100 Subject: [PATCH 28/31] test: some small changes to mock model --- ...ING_WITH_MOCKS.md => TESTING_WITH_MOCKS.md | 0 ammico/test/conftest.py | 79 +++++++++++-------- ammico/test/test_image_summary.py | 7 +- 3 files changed, 48 insertions(+), 38 deletions(-) rename ammico/test/TESTING_WITH_MOCKS.md => TESTING_WITH_MOCKS.md (100%) diff --git a/ammico/test/TESTING_WITH_MOCKS.md b/TESTING_WITH_MOCKS.md similarity index 100% rename from ammico/test/TESTING_WITH_MOCKS.md rename to TESTING_WITH_MOCKS.md diff --git a/ammico/test/conftest.py b/ammico/test/conftest.py index c2d13a5..f4c8040 100644 --- a/ammico/test/conftest.py +++ b/ammico/test/conftest.py @@ -1,7 +1,7 @@ import os import pytest -from unittest.mock import Mock, MagicMock from ammico.model import MultimodalSummaryModel +import torch @pytest.fixture @@ -65,42 +65,55 @@ def mock_model(): Mock model fixture that doesn't load the actual model. Useful for faster unit tests that don't need actual model inference. 
""" - import torch - # Create a mock model object - mock_model_obj = MagicMock(spec=["generate", "eval"]) - mock_model_obj.device = "cpu" - mock_model_obj.eval = MagicMock(return_value=mock_model_obj) + class MockProcessor: + """Mock processor that mimics AutoProcessor behavior.""" - # Create mock processor with necessary methods - mock_processor = MagicMock() - mock_processor.apply_chat_template = MagicMock( - side_effect=lambda messages, **kwargs: "processed_text" - ) + def apply_chat_template(self, messages, **kwargs): + return "processed_text" - # Mock processor to return tensor-like inputs - def mock_processor_call(text, images, **kwargs): - batch_size = len(text) if isinstance(text, list) else 1 - return { - "input_ids": torch.randint(0, 1000, (batch_size, 10)), - "pixel_values": torch.randn(batch_size, 3, 224, 224), - "attention_mask": torch.ones(batch_size, 10), - } + def __call__(self, text, images, **kwargs): + """Mock processing that returns tensor-like inputs.""" + batch_size = len(text) if isinstance(text, list) else 1 + return { + "input_ids": torch.randint(0, 1000, (batch_size, 10)), + "pixel_values": torch.randn(batch_size, 3, 224, 224), + "attention_mask": torch.ones(batch_size, 10), + } - mock_processor.__call__ = MagicMock(side_effect=mock_processor_call) + class MockTokenizer: + """Mock tokenizer that mimics AutoTokenizer behavior.""" - # Create mock tokenizer - mock_tokenizer = MagicMock() - mock_tokenizer.batch_decode = MagicMock( - side_effect=lambda ids, **kwargs: ["mock caption" for _ in range(len(ids))] - ) + def batch_decode(self, ids, **kwargs): + """Return mock captions for the given batch size.""" + batch_size = ids.shape[0] if hasattr(ids, "shape") else len(ids) + return ["mock caption" for _ in range(batch_size)] - # Create the mock model instance - mock_m = Mock() - mock_m.model = mock_model_obj - mock_m.processor = mock_processor - mock_m.tokenizer = mock_tokenizer - mock_m.device = "cpu" - mock_m.close = MagicMock() + class MockModelObj: + """Mock model object that mimics the model.generate behavior.""" - return mock_m + def __init__(self): + self.device = "cpu" + + def eval(self): + return self + + def generate(self, input_ids=None, **kwargs): + """Generate mock token IDs.""" + batch_size = input_ids.shape[0] if hasattr(input_ids, "shape") else 1 + return torch.randint(0, 1000, (batch_size, 20)) + + class MockMultimodalSummaryModel: + """Mock MultimodalSummaryModel that doesn't load actual models.""" + + def __init__(self): + self.model = MockModelObj() + self.processor = MockProcessor() + self.tokenizer = MockTokenizer() + self.device = "cpu" + + def close(self): + """Mock close method - no actual cleanup needed.""" + pass + + return MockMultimodalSummaryModel() diff --git a/ammico/test/test_image_summary.py b/ammico/test/test_image_summary.py index 475fbac..1d0dda2 100644 --- a/ammico/test/test_image_summary.py +++ b/ammico/test/test_image_summary.py @@ -1,5 +1,6 @@ from ammico.image_summary import ImageSummaryDetector - +import os +from PIL import Image import pytest @@ -70,13 +71,9 @@ def test_load_pil_if_needed_string(mock_model): """Test loading image from file path.""" detector = ImageSummaryDetector(summary_model=mock_model) # This will try to actually load a file, so we'll use a test image - import os - test_image_path = os.path.join(os.path.dirname(__file__), "data", "IMG_2746.png") if os.path.exists(test_image_path): img = detector._load_pil_if_needed(test_image_path) - from PIL import Image - assert isinstance(img, Image.Image) assert 
img.mode == "RGB" From a65f1e2287d6807f0fdb7fbf7bdea59e60715806 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 27 Oct 2025 09:59:03 +0100 Subject: [PATCH 29/31] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ammico/image_summary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ammico/image_summary.py b/ammico/image_summary.py index 7d462f2..16aeea5 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -289,7 +289,7 @@ class ImageSummaryDetector(AnalysisMethod): max_new_tokens = self.token_prompt_config[ "concise" if is_concise_summary else "default" ]["summary"]["max_new_tokens"] - inputs = self._prepare_inputs(prompt, entry) + inputs = self._prepare_inputs([prompt], entry) gen_conf = GenerationConfig( max_new_tokens=max_new_tokens, @@ -384,10 +384,10 @@ class ImageSummaryDetector(AnalysisMethod): """ prompt = self.token_prompt_config[ "concise" if is_concise_answer else "default" - ]["answer"]["prompt"] + ]["questions"]["prompt"] max_new_tokens = self.token_prompt_config[ "concise" if is_concise_answer else "default" - ]["answer"]["max_new_tokens"] + ]["questions"]["max_new_tokens"] list_of_questions = self._clean_list_of_questions(list_of_questions, prompt) gen_conf = GenerationConfig(max_new_tokens=max_new_tokens, do_sample=False) From 8e9f2b6d8732e4eca1eeb949da40789f3e3f90d3 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 27 Oct 2025 10:01:08 +0100 Subject: [PATCH 30/31] Update ammico/notebooks/DemoNotebook_ammico.ipynb Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ammico/notebooks/DemoNotebook_ammico.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index 4c9fd62..f175bf6 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -970,8 +970,7 @@ "source": [ "for key in image_dict.keys():\n", " image_dict[key] = ammico.colors.ColorDetector(image_dict[key]).analyse_image()\n", - "\n", - "print(\"testing signature\")" + "\n" ] }, { From 4a18b1b5a99cb7d6cb99171fb461189d7dd71758 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 09:01:37 +0000 Subject: [PATCH 31/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ammico/notebooks/DemoNotebook_ammico.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ammico/notebooks/DemoNotebook_ammico.ipynb b/ammico/notebooks/DemoNotebook_ammico.ipynb index f175bf6..c9578e2 100644 --- a/ammico/notebooks/DemoNotebook_ammico.ipynb +++ b/ammico/notebooks/DemoNotebook_ammico.ipynb @@ -969,8 +969,7 @@ "outputs": [], "source": [ "for key in image_dict.keys():\n", - " image_dict[key] = ammico.colors.ColorDetector(image_dict[key]).analyse_image()\n", - "\n" + " image_dict[key] = ammico.colors.ColorDetector(image_dict[key]).analyse_image()" ] }, {