From 2326aef4b561e360918c5fa78a5cf6c322517df5 Mon Sep 17 00:00:00 2001
From: Dmitrii Kapitan
Date: Mon, 22 Sep 2025 16:40:02 +0200
Subject: [PATCH] Add example notebook and small fixes

---
 ammico/__init__.py                         |   2 +
 ammico/image_summary.py                    |   1 -
 ammico/notebooks/DemoImageSummaryVQA.ipynb | 190 +++++++++++++++++++++
 ammico/utils.py                            |   9 +
 4 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 ammico/notebooks/DemoImageSummaryVQA.ipynb

diff --git a/ammico/__init__.py b/ammico/__init__.py
index 9a25ade..1bf343d 100644
--- a/ammico/__init__.py
+++ b/ammico/__init__.py
@@ -2,6 +2,7 @@ from ammico.display import AnalysisExplorer
 from ammico.faces import EmotionDetector, ethical_disclosure
 from ammico.model import MultimodalSummaryModel
 from ammico.text import TextDetector, TextAnalyzer, privacy_disclosure
+from ammico.image_summary import ImageSummaryDetector
 from ammico.utils import find_files, get_dataframe
 
 # Export the version defined in project metadata
@@ -18,6 +19,7 @@ __all__ = [
     "MultimodalSummaryModel",
     "TextDetector",
     "TextAnalyzer",
+    "ImageSummaryDetector",
     "find_files",
     "get_dataframe",
     "ethical_disclosure",
diff --git a/ammico/image_summary.py b/ammico/image_summary.py
index c4b2444..0cdaebe 100644
--- a/ammico/image_summary.py
+++ b/ammico/image_summary.py
@@ -9,7 +9,6 @@ import warnings
 from typing import List, Optional, Union, Dict, Any
 from collections.abc import Sequence as _Sequence
 from transformers import GenerationConfig
-import re
 
 from qwen_vl_utils import process_vision_info
 
diff --git a/ammico/notebooks/DemoImageSummaryVQA.ipynb b/ammico/notebooks/DemoImageSummaryVQA.ipynb
new file mode 100644
index 0000000..b067e1f
--- /dev/null
+++ b/ammico/notebooks/DemoImageSummaryVQA.ipynb
@@ -0,0 +1,190 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "# Image summary and visual question answering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1",
+   "metadata": {},
+   "source": [
+    "This notebook shows how to generate image captions and use visual question answering (VQA) with AMMICO.\n",
+    "\n",
+    "The first cell imports `ammico`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ammico"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3",
+   "metadata": {},
+   "source": [
+    "The cell below loads the model used for summarization and VQA. By default, a large model is loaded on the GPU if your device supports CUDA; otherwise, a smaller model is loaded on the CPU. You can also request other configurations explicitly (e.g., the small model on the GPU)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = ammico.MultimodalSummaryModel()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
+   "source": [
+    "Provide the path to the Google Drive folder or local folder containing your images."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_dict = ammico.find_files(\n",
+    "    path=\"/insert/your/path/here/\",\n",
+    "    limit=-1,  # -1 means no limit on the number of files (the default is 20)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7",
+   "metadata": {},
+   "source": [
+    "The cell below creates the detector object that performs the analysis, combining the loaded model with your image dictionary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img = ammico.ImageSummaryDetector(summary_model=model, subdict=image_dict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9",
+   "metadata": {},
+   "source": [
+    "## Image summary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10",
+   "metadata": {},
+   "source": [
+    "To analyze your images, call the `analyse_images` method.\n",
+    "\n",
+    "You can specify the kind of analysis with `analysis_type`: `\"summary\"` generates a summary for every picture in your dictionary, `\"questions\"` answers your questions for every picture, and `\"summary_and_questions\"` does both.\n",
+    "\n",
+    "The `is_concise_summary` parameter controls the length of the generated summary.\n",
+    "\n",
+    "Here we request a long summary for each image in the dictionary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summaries = img.analyse_images(analysis_type=\"summary\", is_concise_summary=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12",
+   "metadata": {},
+   "source": [
+    "## VQA"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13",
+   "metadata": {},
+   "source": [
+    "In addition to generating summaries, the same model can be used in VQA mode. To do this, define the questions that will be asked about every image in your dictionary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "questions = [\"Are there any visible signs of violence?\", \"Is it safe to be there?\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15",
+   "metadata": {},
+   "source": [
+    "Here is an example of VQA mode usage. Set `is_concise_answer=True` (recommended) to receive short answers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vqa_results = img.analyse_images(\n",
+    "    analysis_type=\"questions\",\n",
+    "    list_of_questions=questions,\n",
+    "    is_concise_answer=True,\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ammico-dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ammico/utils.py b/ammico/utils.py
index 39a0ecb..38f4144 100644
--- a/ammico/utils.py
+++ b/ammico/utils.py
@@ -7,6 +7,9 @@ import collections
 import random
 
+from enum import Enum
+
+
 pkg = importlib_resources.files("ammico")
 
 
@@ -40,6 +43,12 @@ def ammico_prefetch_models():
     res.get()
 
 
+class AnalysisType(str, Enum):
+    SUMMARY = "summary"
+    QUESTIONS = "questions"
+    SUMMARY_AND_QUESTIONS = "summary_and_questions"
+
+
 class AnalysisMethod:
     """Base class to be inherited by all analysis methods."""