basic integration into display functionality

Этот коммит содержится в:
Dmitrii Kapitan 2025-09-27 16:42:05 +02:00 коммит произвёл DimasfromLavoisier
родитель 5c7e2c3f64
Коммит 402a379f9c
2 изменённых файлов: 192 добавлений и 117 удалений

Просмотреть файл

@ -1,12 +1,14 @@
import ammico.faces as faces import ammico.faces as faces
import ammico.text as text import ammico.text as text
import ammico.colors as colors import ammico.colors as colors
import ammico.image_summary as image_summary
from ammico.model import MultimodalSummaryModel
import pandas as pd import pandas as pd
from dash import html, Input, Output, dcc, State, Dash from dash import html, Input, Output, dcc, State, Dash
from PIL import Image from PIL import Image
import dash_bootstrap_components as dbc import dash_bootstrap_components as dbc
import warnings import warnings
from typing import Dict, Any, List from typing import Dict, Any, List, Optional
COLOR_SCHEMES = [ COLOR_SCHEMES = [
@ -97,7 +99,6 @@ class AnalysisExplorer:
State("left_select_id", "value"), State("left_select_id", "value"),
State("Dropdown_select_Detector", "value"), State("Dropdown_select_Detector", "value"),
State("Dropdown_analysis_type", "value"), State("Dropdown_analysis_type", "value"),
State("checkbox_enable_image_tasks", "value"),
State("textarea_questions", "value"), State("textarea_questions", "value"),
State("setting_privacy_env_var", "value"), State("setting_privacy_env_var", "value"),
State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_emotion_threshold", "value"),
@ -112,9 +113,15 @@ class AnalysisExplorer:
Output("settings_TextDetector", "style"), Output("settings_TextDetector", "style"),
Output("settings_EmotionDetector", "style"), Output("settings_EmotionDetector", "style"),
Output("settings_ColorDetector", "style"), Output("settings_ColorDetector", "style"),
Output("settings_VQA", "style"),
Input("Dropdown_select_Detector", "value"), Input("Dropdown_select_Detector", "value"),
)(self._update_detector_setting) )(self._update_detector_setting)
self.app.callback(
Output("textarea_questions", "style"),
Input("Dropdown_analysis_type", "value"),
)(self._show_questions_textarea_on_demand)
# I split the different sections into subfunctions for better clarity # I split the different sections into subfunctions for better clarity
def _top_file_explorer(self, mydict: dict) -> html.Div: def _top_file_explorer(self, mydict: dict) -> html.Div:
"""Initialize the file explorer dropdown for selecting the file to be analyzed. """Initialize the file explorer dropdown for selecting the file to be analyzed.
@ -268,8 +275,69 @@ class AnalysisExplorer:
) )
], ],
), ),
# start VQA settings
html.Div(
id="settings_VQA",
style={"display": "none"},
children=[
dbc.Card(
[
dbc.CardBody(
[
dbc.Row(
dbc.Col(
dcc.Dropdown(
id="Dropdown_analysis_type",
options=[
{"label": v, "value": v}
for v in SUMMARY_ANALYSIS_TYPE
], ],
style={"width": "100%", "display": "inline-block"}, value="summary_and_questions",
clearable=False,
style={
"width": "100%",
"minWidth": "240px",
"maxWidth": "520px",
},
),
),
justify="start",
),
html.Div(style={"height": "8px"}),
dbc.Row(
[
dbc.Col(
dcc.Textarea(
id="textarea_questions",
value="Are there people in the image?\nWhat is this picture about?",
placeholder="One question per line...",
style={
"width": "100%",
"minHeight": "160px",
"height": "220px",
"resize": "vertical",
"overflow": "auto",
},
rows=8,
),
width=12,
),
],
justify="start",
),
]
)
],
style={
"width": "100%",
"marginTop": "10px",
"zIndex": 2000,
},
)
],
),
],
style={"width": "100%", "display": "inline-block", "overflow": "visible"},
) )
return settings_layout return settings_layout
@ -289,6 +357,7 @@ class AnalysisExplorer:
"TextDetector", "TextDetector",
"EmotionDetector", "EmotionDetector",
"ColorDetector", "ColorDetector",
"VQA",
], ],
value="TextDetector", value="TextDetector",
id="Dropdown_select_Detector", id="Dropdown_select_Detector",
@ -296,37 +365,6 @@ class AnalysisExplorer:
), ),
justify="start", justify="start",
), ),
# NEW: Analysis-type selector (summary/questions/summary_and_questions)
dbc.Row(
dcc.Dropdown(
id="Dropdown_analysis_type",
options=[{"label": v, "value": v} for v in SUMMARY_ANALYSIS_TYPE],
value="summary_and_questions",
style={"width": "60%", "margin-top": "8px"},
),
justify="start",
),
# NEW: Enable image-level tasks (VQA / caption) checkbox
dbc.Row(
dcc.Checklist(
id="checkbox_enable_image_tasks",
options=[{"label": "Enable Image Tasks (Caption / VQA)", "value": "enabled"}],
value=["enabled"], # default enabled
inline=True,
style={"margin-top": "8px"},
),
justify="start",
),
# NEW: Questions textarea (newline-separated). Only used if analysis_type includes "questions".
dbc.Row(
dcc.Textarea(
id="textarea_questions",
value="Are there people in the image?\nWhat is this picture about?",
placeholder="One question per line...",
style={"width": "60%", "height": "120px", "margin-top": "8px"},
),
justify="start",
),
dbc.Row( dbc.Row(
children=[self._create_setting_layout()], children=[self._create_setting_layout()],
id="div_detector_args", id="div_detector_args",
@ -402,15 +440,22 @@ class AnalysisExplorer:
} }
if setting_input == "TextDetector": if setting_input == "TextDetector":
return display_flex, display_none, display_none return display_flex, display_none, display_none, display_none
if setting_input == "EmotionDetector": if setting_input == "EmotionDetector":
return display_none, display_flex, display_none return display_none, display_flex, display_none, display_none
if setting_input == "ColorDetector": if setting_input == "ColorDetector":
return display_none, display_none, display_flex return display_none, display_none, display_flex, display_none
if setting_input == "VQA":
return display_none, display_none, display_none, display_flex
else: else:
return display_none, display_none, display_none return display_none, display_none, display_none, display_none
def _parse_questions(self, text: Optional[str]) -> Optional[List[str]]:
if not text:
return None
qs = [q.strip() for q in text.splitlines() if q.strip()]
return qs if qs else None
def _right_output_analysis( def _right_output_analysis(
self, self,
@ -418,8 +463,9 @@ class AnalysisExplorer:
all_img_options: dict, all_img_options: dict,
current_img_value: str, current_img_value: str,
detector_value: str, detector_value: str,
analysis_type_value: str,
textarea_questions_value: str,
setting_privacy_env_var: str, setting_privacy_env_var: str,
checkbox_enable_image_tasks_value: List[str],
setting_emotion_emotion_threshold: int, setting_emotion_emotion_threshold: int,
setting_emotion_race_threshold: int, setting_emotion_race_threshold: int,
setting_emotion_gender_threshold: int, setting_emotion_gender_threshold: int,
@ -439,22 +485,39 @@ class AnalysisExplorer:
"EmotionDetector": faces.EmotionDetector, "EmotionDetector": faces.EmotionDetector,
"TextDetector": text.TextDetector, "TextDetector": text.TextDetector,
"ColorDetector": colors.ColorDetector, "ColorDetector": colors.ColorDetector,
"VQA": image_summary.ImageSummaryDetector,
} }
# Get image ID from dropdown value, which is the filepath # Get image ID from dropdown value, which is the filepath
if current_img_value is None: if current_img_value is None:
return {} return {}
image_id = all_img_options[current_img_value] image_id = all_img_options[current_img_value]
# copy image so prvious runs don't leave their default values in the dict image_copy = self.mydict.get(image_id, {}).copy()
image_copy = self.mydict[image_id].copy()
analysis_dict: Dict[str, Any] = {}
if detector_value == "VQA":
try:
qwen_model = MultimodalSummaryModel(
model_id="Qwen/Qwen2.5-VL-3B-Instruct"
) # TODO: allow user to specify model
vqa_cls = identify_dict.get("VQA")
vqa_detector = vqa_cls(qwen_model, subdict={})
questions_list = self._parse_questions(textarea_questions_value)
analysis_result = vqa_detector.analyse_image(
image_copy,
analysis_type=analysis_type_value,
list_of_questions=questions_list,
is_concise_summary=True,
is_concise_answer=True,
)
analysis_dict = analysis_result or {}
except Exception as e:
warnings.warn(f"VQA/Image tasks failed: {e}")
analysis_dict = {"image_tasks_error": str(e)}
else:
# detector value is the string name of the chosen detector # detector value is the string name of the chosen detector
identify_function = identify_dict[detector_value] identify_function = identify_dict[detector_value]
identify_function = identify_dict.get(detector_value)
if identify_function is None:
detector_class = None
if detector_value == "TextDetector": if detector_value == "TextDetector":
detector_class = identify_function( detector_class = identify_function(
image_copy, image_copy,
@ -484,33 +547,9 @@ class AnalysisExplorer:
else: else:
detector_class = identify_function(image_copy) detector_class = identify_function(image_copy)
if detector_class is not None:
analysis_dict = detector_class.analyse_image() analysis_dict = detector_class.analyse_image()
else:
analysis_dict = {}
image_tasks_result: Dict[str, Any] = {} new_analysis_dict: Dict[str, Any] = {}
enable_image_tasks = "enabled" in (checkbox_enable_image_tasks_value or [])
if enable_image_tasks:
# parse questions textarea: newline separated
if textarea_questions_value:
questions_list = [q.strip() for q in textarea_questions_value.splitlines() if q.strip()]
else:
questions_list = None
try:
image_tasks_result = self.analyse_image(
image_copy,
analysis_type=analysis_type_value,
list_of_questions=questions_list,
is_concise_summary=True,
is_concise_answer=True,
)
except Exception as e:
warnings.warn(f"Image tasks failed: {e}")
image_tasks_result = {"image_tasks_error": str(e)}
# Initialize an empty dictionary
new_analysis_dict = {}
# Iterate over the items in the original dictionary # Iterate over the items in the original dictionary
for k, v in analysis_dict.items(): for k, v in analysis_dict.items():
@ -524,21 +563,15 @@ class AnalysisExplorer:
# Add the new key-value pair to the new dictionary # Add the new key-value pair to the new dictionary
new_analysis_dict[k] = new_value new_analysis_dict[k] = new_value
if "caption" in image_tasks_result:
new_analysis_dict["caption"] = image_tasks_result.get("caption", "")
if "vqa" in image_tasks_result:
# vqa is expected to be a dict; convert to readable string
vqa_entries = image_tasks_result["vqa"]
if isinstance(vqa_entries, dict):
new_analysis_dict["vqa"] = "; ".join([f"{q}: {a}" for q, a in vqa_entries.items()])
else:
new_analysis_dict["vqa"] = str(vqa_entries)
for err_key in ("caption_error", "vqa_error", "image_tasks_error"):
if err_key in image_tasks_result:
new_analysis_dict[err_key] = image_tasks_result[err_key]
df = pd.DataFrame([new_analysis_dict]).set_index("filename").T df = pd.DataFrame([new_analysis_dict]).set_index("filename").T
df.index.rename("filename", inplace=True) df.index.rename("filename", inplace=True)
return dbc.Table.from_dataframe( return dbc.Table.from_dataframe(
df, striped=True, bordered=True, hover=True, index=True df, striped=True, bordered=True, hover=True, index=True
) )
def _show_questions_textarea_on_demand(self, analysis_type_value: str) -> dict:
if analysis_type_value in ("questions", "summary_and_questions"):
return {"display": "block", "width": "100%"}
else:
return {"display": "none"}

Просмотреть файл

@ -16,7 +16,7 @@ class ImageSummaryDetector(AnalysisMethod):
def __init__( def __init__(
self, self,
summary_model: MultimodalSummaryModel, summary_model: MultimodalSummaryModel,
subdict: dict = {}, subdict: Optional[Dict[str, Any]] = None,
) -> None: ) -> None:
""" """
Class for analysing images using QWEN-2.5-VL model. Class for analysing images using QWEN-2.5-VL model.
@ -29,6 +29,8 @@ class ImageSummaryDetector(AnalysisMethod):
Returns: Returns:
None. None.
""" """
if subdict is None:
subdict = {}
super().__init__(subdict) super().__init__(subdict)
self.summary_model = summary_model self.summary_model = summary_model
@ -148,7 +150,50 @@ class ImageSummaryDetector(AnalysisMethod):
return analysis_type, list_of_questions, is_summary, is_questions return analysis_type, list_of_questions, is_summary, is_questions
def analyse_image(
    self,
    entry: dict,
    analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY_AND_QUESTIONS,
    list_of_questions: Optional[List[str]] = None,
    max_questions_per_image: int = 32,
    is_concise_summary: bool = True,
    is_concise_answer: bool = True,
) -> Dict[str, Any]:
    """Analyse a single image entry.

    Depending on *analysis_type*, the returned dict gains:
      - 'caption' (str) when a summary is requested
      - 'vqa' (dict) when questions are requested

    A failure in either step only emits a warning, so a partial result
    is still returned rather than raising.
    """
    # The entry itself is adopted as the working dict; result keys are
    # written into it and it is returned to the caller.
    self.subdict = entry
    (
        analysis_type,
        list_of_questions,
        is_summary,
        is_questions,
    ) = self._validate_analysis_type(
        analysis_type, list_of_questions, max_questions_per_image
    )

    if is_summary:
        try:
            captions = self.generate_caption(
                entry,
                num_return_sequences=1,
                is_concise_summary=is_concise_summary,
            )
            self.subdict["caption"] = captions[0] if captions else ""
        except Exception as e:
            warnings.warn(f"Caption generation failed: {e}")

    if is_questions:
        try:
            answers = self.answer_questions(
                list_of_questions, entry, is_concise_answer
            )
            self.subdict["vqa"] = answers
        except Exception as e:
            warnings.warn(f"VQA failed: {e}")

    return self.subdict
def analyse_images_from_dict(
self, self,
analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS,
list_of_questions: Optional[List[str]] = None, list_of_questions: Optional[List[str]] = None,
@ -191,9 +236,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
entry["caption"] = caps[0] if caps else "" entry["caption"] = caps[0] if caps else ""
except Exception as e: except Exception as e:
warnings.warn( warnings.warn(f"Caption generation failed: {e}")
"Caption generation failed for key %s: %s", key, e
)
if is_questions: if is_questions:
try: try:
@ -202,7 +245,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
entry["vqa"] = vqa_map entry["vqa"] = vqa_map
except Exception as e: except Exception as e:
warnings.warn("VQA failed for key %s: %s", key, e) warnings.warn(f"VQA failed: {e}")
self.subdict[key] = entry self.subdict[key] = entry
return self.subdict return self.subdict
@ -251,8 +294,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
except RuntimeError as e: except RuntimeError as e:
warnings.warn( warnings.warn(
"Retry without autocast failed: %s. Attempting cudnn-disabled retry.", f"Retry without autocast failed: {e}. Attempting cudnn-disabled retry."
e,
) )
cudnn_was_enabled = ( cudnn_was_enabled = (
torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled