From 402a379f9c819d89b752ebb4c27af261c3f7c097 Mon Sep 17 00:00:00 2001
From: Dmitrii Kapitan
Date: Sat, 27 Sep 2025 16:42:05 +0200
Subject: [PATCH] basic integration of image summary/VQA into display functionality

---
 ammico/display.py       | 251 +++++++++++++++++++++++-----------------
 ammico/image_summary.py |  58 ++++++++--
 2 files changed, 192 insertions(+), 117 deletions(-)

diff --git a/ammico/display.py b/ammico/display.py
index ff86a3e..b916dbf 100644
--- a/ammico/display.py
+++ b/ammico/display.py
@@ -1,12 +1,14 @@
 import ammico.faces as faces
 import ammico.text as text
 import ammico.colors as colors
+import ammico.image_summary as image_summary
+from ammico.model import MultimodalSummaryModel
 import pandas as pd
 from dash import html, Input, Output, dcc, State, Dash
 from PIL import Image
 import dash_bootstrap_components as dbc
 import warnings
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 
 
 COLOR_SCHEMES = [
@@ -97,7 +99,6 @@ class AnalysisExplorer:
             State("left_select_id", "value"),
             State("Dropdown_select_Detector", "value"),
             State("Dropdown_analysis_type", "value"),
-            State("checkbox_enable_image_tasks", "value"),
             State("textarea_questions", "value"),
             State("setting_privacy_env_var", "value"),
             State("setting_Emotion_emotion_threshold", "value"),
@@ -112,9 +113,15 @@ class AnalysisExplorer:
             Output("settings_TextDetector", "style"),
             Output("settings_EmotionDetector", "style"),
             Output("settings_ColorDetector", "style"),
+            Output("settings_VQA", "style"),
             Input("Dropdown_select_Detector", "value"),
         )(self._update_detector_setting)
 
+        self.app.callback(
+            Output("textarea_questions", "style"),
+            Input("Dropdown_analysis_type", "value"),
+        )(self._show_questions_textarea_on_demand)
+
     # I split the different sections into subfunctions for better clarity
     def _top_file_explorer(self, mydict: dict) -> html.Div:
         """Initialize the file explorer dropdown for selecting the file to be analyzed.
@@ -268,8 +275,69 @@ class AnalysisExplorer: ) ], ), + # start VQA settings + html.Div( + id="settings_VQA", + style={"display": "none"}, + children=[ + dbc.Card( + [ + dbc.CardBody( + [ + dbc.Row( + dbc.Col( + dcc.Dropdown( + id="Dropdown_analysis_type", + options=[ + {"label": v, "value": v} + for v in SUMMARY_ANALYSIS_TYPE + ], + value="summary_and_questions", + clearable=False, + style={ + "width": "100%", + "minWidth": "240px", + "maxWidth": "520px", + }, + ), + ), + justify="start", + ), + html.Div(style={"height": "8px"}), + dbc.Row( + [ + dbc.Col( + dcc.Textarea( + id="textarea_questions", + value="Are there people in the image?\nWhat is this picture about?", + placeholder="One question per line...", + style={ + "width": "100%", + "minHeight": "160px", + "height": "220px", + "resize": "vertical", + "overflow": "auto", + }, + rows=8, + ), + width=12, + ), + ], + justify="start", + ), + ] + ) + ], + style={ + "width": "100%", + "marginTop": "10px", + "zIndex": 2000, + }, + ) + ], + ), ], - style={"width": "100%", "display": "inline-block"}, + style={"width": "100%", "display": "inline-block", "overflow": "visible"}, ) return settings_layout @@ -289,6 +357,7 @@ class AnalysisExplorer: "TextDetector", "EmotionDetector", "ColorDetector", + "VQA", ], value="TextDetector", id="Dropdown_select_Detector", @@ -296,37 +365,6 @@ class AnalysisExplorer: ), justify="start", ), - # NEW: Analysis-type selector (summary/questions/summary_and_questions) - dbc.Row( - dcc.Dropdown( - id="Dropdown_analysis_type", - options=[{"label": v, "value": v} for v in SUMMARY_ANALYSIS_TYPE], - value="summary_and_questions", - style={"width": "60%", "margin-top": "8px"}, - ), - justify="start", - ), - # NEW: Enable image-level tasks (VQA / caption) checkbox - dbc.Row( - dcc.Checklist( - id="checkbox_enable_image_tasks", - options=[{"label": "Enable Image Tasks (Caption / VQA)", "value": "enabled"}], - value=["enabled"], # default enabled - inline=True, - style={"margin-top": "8px"}, - ), - justify="start", - ), - # NEW: Questions textarea (newline-separated). Only used if analysis_type includes "questions". 
- dbc.Row( - dcc.Textarea( - id="textarea_questions", - value="Are there people in the image?\nWhat is this picture about?", - placeholder="One question per line...", - style={"width": "60%", "height": "120px", "margin-top": "8px"}, - ), - justify="start", - ), dbc.Row( children=[self._create_setting_layout()], id="div_detector_args", @@ -402,15 +440,22 @@ class AnalysisExplorer: } if setting_input == "TextDetector": - return display_flex, display_none, display_none + return display_flex, display_none, display_none, display_none if setting_input == "EmotionDetector": - return display_none, display_flex, display_none + return display_none, display_flex, display_none, display_none if setting_input == "ColorDetector": - return display_none, display_none, display_flex - + return display_none, display_none, display_flex, display_none + if setting_input == "VQA": + return display_none, display_none, display_none, display_flex else: - return display_none, display_none, display_none + return display_none, display_none, display_none, display_none + + def _parse_questions(self, text: Optional[str]) -> Optional[List[str]]: + if not text: + return None + qs = [q.strip() for q in text.splitlines() if q.strip()] + return qs if qs else None def _right_output_analysis( self, @@ -418,8 +463,9 @@ class AnalysisExplorer: all_img_options: dict, current_img_value: str, detector_value: str, + analysis_type_value: str, + textarea_questions_value: str, setting_privacy_env_var: str, - checkbox_enable_image_tasks_value: List[str], setting_emotion_emotion_threshold: int, setting_emotion_race_threshold: int, setting_emotion_gender_threshold: int, @@ -439,78 +485,71 @@ class AnalysisExplorer: "EmotionDetector": faces.EmotionDetector, "TextDetector": text.TextDetector, "ColorDetector": colors.ColorDetector, + "VQA": image_summary.ImageSummaryDetector, } # Get image ID from dropdown value, which is the filepath if current_img_value is None: return {} image_id = all_img_options[current_img_value] - # copy image so prvious runs don't leave their default values in the dict - image_copy = self.mydict[image_id].copy() - - # detector value is the string name of the chosen detector - identify_function = identify_dict[detector_value] - - identify_function = identify_dict.get(detector_value) - if identify_function is None: - detector_class = None - - if detector_value == "TextDetector": - detector_class = identify_function( - image_copy, - accept_privacy=( - setting_privacy_env_var - if setting_privacy_env_var - else "PRIVACY_AMMICO" - ), - ) - elif detector_value == "EmotionDetector": - detector_class = identify_function( - image_copy, - emotion_threshold=setting_emotion_emotion_threshold, - race_threshold=setting_emotion_race_threshold, - gender_threshold=setting_emotion_gender_threshold, - accept_disclosure=( - setting_emotion_env_var - if setting_emotion_env_var - else "DISCLOSURE_AMMICO" - ), - ) - elif detector_value == "ColorDetector": - detector_class = identify_function( - image_copy, - delta_e_method=setting_color_delta_e_method, - ) - else: - detector_class = identify_function(image_copy) - - if detector_class is not None: - analysis_dict = detector_class.analyse_image() - else: - analysis_dict = {} - - image_tasks_result: Dict[str, Any] = {} - enable_image_tasks = "enabled" in (checkbox_enable_image_tasks_value or []) - if enable_image_tasks: - # parse questions textarea: newline separated - if textarea_questions_value: - questions_list = [q.strip() for q in textarea_questions_value.splitlines() if q.strip()] - else: 
- questions_list = None + image_copy = self.mydict.get(image_id, {}).copy() + analysis_dict: Dict[str, Any] = {} + if detector_value == "VQA": try: - image_tasks_result = self.analyse_image( + qwen_model = MultimodalSummaryModel( + model_id="Qwen/Qwen2.5-VL-3B-Instruct" + ) # TODO: allow user to specify model + vqa_cls = identify_dict.get("VQA") + vqa_detector = vqa_cls(qwen_model, subdict={}) + questions_list = self._parse_questions(textarea_questions_value) + analysis_result = vqa_detector.analyse_image( image_copy, analysis_type=analysis_type_value, list_of_questions=questions_list, is_concise_summary=True, is_concise_answer=True, ) + analysis_dict = analysis_result or {} except Exception as e: - warnings.warn(f"Image tasks failed: {e}") - image_tasks_result = {"image_tasks_error": str(e)} - # Initialize an empty dictionary - new_analysis_dict = {} + warnings.warn(f"VQA/Image tasks failed: {e}") + analysis_dict = {"image_tasks_error": str(e)} + else: + # detector value is the string name of the chosen detector + identify_function = identify_dict[detector_value] + + if detector_value == "TextDetector": + detector_class = identify_function( + image_copy, + accept_privacy=( + setting_privacy_env_var + if setting_privacy_env_var + else "PRIVACY_AMMICO" + ), + ) + elif detector_value == "EmotionDetector": + detector_class = identify_function( + image_copy, + emotion_threshold=setting_emotion_emotion_threshold, + race_threshold=setting_emotion_race_threshold, + gender_threshold=setting_emotion_gender_threshold, + accept_disclosure=( + setting_emotion_env_var + if setting_emotion_env_var + else "DISCLOSURE_AMMICO" + ), + ) + elif detector_value == "ColorDetector": + detector_class = identify_function( + image_copy, + delta_e_method=setting_color_delta_e_method, + ) + else: + detector_class = identify_function(image_copy) + + analysis_dict = detector_class.analyse_image() + + new_analysis_dict: Dict[str, Any] = {} # Iterate over the items in the original dictionary for k, v in analysis_dict.items(): @@ -524,21 +563,15 @@ class AnalysisExplorer: # Add the new key-value pair to the new dictionary new_analysis_dict[k] = new_value - if "caption" in image_tasks_result: - new_analysis_dict["caption"] = image_tasks_result.get("caption", "") - if "vqa" in image_tasks_result: - # vqa is expected to be a dict; convert to readable string - vqa_entries = image_tasks_result["vqa"] - if isinstance(vqa_entries, dict): - new_analysis_dict["vqa"] = "; ".join([f"{q}: {a}" for q, a in vqa_entries.items()]) - else: - new_analysis_dict["vqa"] = str(vqa_entries) - for err_key in ("caption_error", "vqa_error", "image_tasks_error"): - if err_key in image_tasks_result: - new_analysis_dict[err_key] = image_tasks_result[err_key] df = pd.DataFrame([new_analysis_dict]).set_index("filename").T df.index.rename("filename", inplace=True) return dbc.Table.from_dataframe( df, striped=True, bordered=True, hover=True, index=True ) + + def _show_questions_textarea_on_demand(self, analysis_type_value: str) -> dict: + if analysis_type_value in ("questions", "summary_and_questions"): + return {"display": "block", "width": "100%"} + else: + return {"display": "none"} diff --git a/ammico/image_summary.py b/ammico/image_summary.py index 3ccc3f4..203ef21 100644 --- a/ammico/image_summary.py +++ b/ammico/image_summary.py @@ -16,7 +16,7 @@ class ImageSummaryDetector(AnalysisMethod): def __init__( self, summary_model: MultimodalSummaryModel, - subdict: dict = {}, + subdict: Optional[Dict[str, Any]] = None, ) -> None: """ Class for analysing 
images using QWEN-2.5-VL model.
@@ -29,6 +29,8 @@ class ImageSummaryDetector(AnalysisMethod):
 
         Returns:
             None.
         """
+        if subdict is None:
+            subdict = {}
         super().__init__(subdict)
         self.summary_model = summary_model
 
@@ -148,7 +150,50 @@ class ImageSummaryDetector(AnalysisMethod):
 
         return analysis_type, list_of_questions, is_summary, is_questions
 
-    def analyse_images(
+    def analyse_image(
+        self,
+        entry: dict,
+        analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY_AND_QUESTIONS,
+        list_of_questions: Optional[List[str]] = None,
+        max_questions_per_image: int = 32,
+        is_concise_summary: bool = True,
+        is_concise_answer: bool = True,
+    ) -> Dict[str, Any]:
+        """
+        Analyse a single image entry. Returns a dict whose keys depend on analysis_type:
+        - 'caption' (str) if a summary was requested
+        - 'vqa' (dict) if questions were requested
+        """
+        self.subdict = entry
+        analysis_type, list_of_questions, is_summary, is_questions = (
+            self._validate_analysis_type(
+                analysis_type, list_of_questions, max_questions_per_image
+            )
+        )
+
+        if is_summary:
+            try:
+                caps = self.generate_caption(
+                    entry,
+                    num_return_sequences=1,
+                    is_concise_summary=is_concise_summary,
+                )
+                self.subdict["caption"] = caps[0] if caps else ""
+            except Exception as e:
+                warnings.warn(f"Caption generation failed: {e}")
+
+        if is_questions:
+            try:
+                vqa_map = self.answer_questions(
+                    list_of_questions, entry, is_concise_answer
+                )
+                self.subdict["vqa"] = vqa_map
+            except Exception as e:
+                warnings.warn(f"VQA failed: {e}")
+
+        return self.subdict
+
+    def analyse_images_from_dict(
         self,
         analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS,
         list_of_questions: Optional[List[str]] = None,
@@ -191,9 +236,7 @@ class ImageSummaryDetector(AnalysisMethod):
                     )
                     entry["caption"] = caps[0] if caps else ""
                 except Exception as e:
-                    warnings.warn(
-                        "Caption generation failed for key %s: %s", key, e
-                    )
+                    warnings.warn(f"Caption generation failed for key {key}: {e}")
 
             if is_questions:
                 try:
@@ -202,7 +245,7 @@ class ImageSummaryDetector(AnalysisMethod):
                     )
                     entry["vqa"] = vqa_map
                 except Exception as e:
-                    warnings.warn("VQA failed for key %s: %s", key, e)
+                    warnings.warn(f"VQA failed for key {key}: {e}")
 
             self.subdict[key] = entry
         return self.subdict
@@ -251,8 +294,7 @@ class ImageSummaryDetector(AnalysisMethod):
                 )
             except RuntimeError as e:
                 warnings.warn(
-                    "Retry without autocast failed: %s. Attempting cudnn-disabled retry.",
-                    e,
+                    f"Retry without autocast failed: {e}. Attempting cudnn-disabled retry."
                 )
                 cudnn_was_enabled = (
                     torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled
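
Usage sketch (reviewer aid, not part of the patch itself): the snippet below shows how the new single-image entry point introduced here could be driven outside the Dash callback, mirroring the "VQA" branch of _right_output_analysis. The MultimodalSummaryModel and ImageSummaryDetector constructor calls and the analyse_image signature are taken from the patch; the example image entry (a dict carrying the path under a "filename" key) and the path itself are assumptions used only for illustration.

    from ammico.model import MultimodalSummaryModel
    from ammico.image_summary import ImageSummaryDetector

    # Same model id that display.py currently hard-codes for the "VQA" detector.
    model = MultimodalSummaryModel(model_id="Qwen/Qwen2.5-VL-3B-Instruct")
    detector = ImageSummaryDetector(model, subdict={})

    # Hypothetical single-image entry; key and path are placeholders.
    entry = {"filename": "/path/to/image.jpg"}

    result = detector.analyse_image(
        entry,
        analysis_type="summary_and_questions",
        list_of_questions=["Are there people in the image?"],
        is_concise_summary=True,
        is_concise_answer=True,
    )
    # "caption" holds the generated summary, "vqa" the question/answer mapping.
    print(result.get("caption"), result.get("vqa"))

Note that the Dash callback constructs a fresh MultimodalSummaryModel on every analysis run (flagged with a TODO in the patch); holding one instance for the session, as in the sketch above, matches how analyse_images_from_dict reuses the model across images.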