basic integration into display functionality

Этот коммит содержится в:
Dmitrii Kapitan 2025-09-27 16:42:05 +02:00 коммит произвёл DimasfromLavoisier
родитель 5c7e2c3f64
Коммит 402a379f9c
2 изменённых файлов: 192 добавлений и 117 удалений

Просмотреть файл

@ -1,12 +1,14 @@
import ammico.faces as faces import ammico.faces as faces
import ammico.text as text import ammico.text as text
import ammico.colors as colors import ammico.colors as colors
import ammico.image_summary as image_summary
from ammico.model import MultimodalSummaryModel
import pandas as pd import pandas as pd
from dash import html, Input, Output, dcc, State, Dash from dash import html, Input, Output, dcc, State, Dash
from PIL import Image from PIL import Image
import dash_bootstrap_components as dbc import dash_bootstrap_components as dbc
import warnings import warnings
from typing import Dict, Any, List from typing import Dict, Any, List, Optional
COLOR_SCHEMES = [ COLOR_SCHEMES = [
@ -97,7 +99,6 @@ class AnalysisExplorer:
State("left_select_id", "value"), State("left_select_id", "value"),
State("Dropdown_select_Detector", "value"), State("Dropdown_select_Detector", "value"),
State("Dropdown_analysis_type", "value"), State("Dropdown_analysis_type", "value"),
State("checkbox_enable_image_tasks", "value"),
State("textarea_questions", "value"), State("textarea_questions", "value"),
State("setting_privacy_env_var", "value"), State("setting_privacy_env_var", "value"),
State("setting_Emotion_emotion_threshold", "value"), State("setting_Emotion_emotion_threshold", "value"),
@ -112,9 +113,15 @@ class AnalysisExplorer:
Output("settings_TextDetector", "style"), Output("settings_TextDetector", "style"),
Output("settings_EmotionDetector", "style"), Output("settings_EmotionDetector", "style"),
Output("settings_ColorDetector", "style"), Output("settings_ColorDetector", "style"),
Output("settings_VQA", "style"),
Input("Dropdown_select_Detector", "value"), Input("Dropdown_select_Detector", "value"),
)(self._update_detector_setting) )(self._update_detector_setting)
self.app.callback(
Output("textarea_questions", "style"),
Input("Dropdown_analysis_type", "value"),
)(self._show_questions_textarea_on_demand)
# I split the different sections into subfunctions for better clarity # I split the different sections into subfunctions for better clarity
def _top_file_explorer(self, mydict: dict) -> html.Div: def _top_file_explorer(self, mydict: dict) -> html.Div:
"""Initialize the file explorer dropdown for selecting the file to be analyzed. """Initialize the file explorer dropdown for selecting the file to be analyzed.
@ -268,8 +275,69 @@ class AnalysisExplorer:
) )
], ],
), ),
# start VQA settings
html.Div(
id="settings_VQA",
style={"display": "none"},
children=[
dbc.Card(
[
dbc.CardBody(
[
dbc.Row(
dbc.Col(
dcc.Dropdown(
id="Dropdown_analysis_type",
options=[
{"label": v, "value": v}
for v in SUMMARY_ANALYSIS_TYPE
], ],
style={"width": "100%", "display": "inline-block"}, value="summary_and_questions",
clearable=False,
style={
"width": "100%",
"minWidth": "240px",
"maxWidth": "520px",
},
),
),
justify="start",
),
html.Div(style={"height": "8px"}),
dbc.Row(
[
dbc.Col(
dcc.Textarea(
id="textarea_questions",
value="Are there people in the image?\nWhat is this picture about?",
placeholder="One question per line...",
style={
"width": "100%",
"minHeight": "160px",
"height": "220px",
"resize": "vertical",
"overflow": "auto",
},
rows=8,
),
width=12,
),
],
justify="start",
),
]
)
],
style={
"width": "100%",
"marginTop": "10px",
"zIndex": 2000,
},
)
],
),
],
style={"width": "100%", "display": "inline-block", "overflow": "visible"},
) )
return settings_layout return settings_layout
@ -289,6 +357,7 @@ class AnalysisExplorer:
"TextDetector", "TextDetector",
"EmotionDetector", "EmotionDetector",
"ColorDetector", "ColorDetector",
"VQA",
], ],
value="TextDetector", value="TextDetector",
id="Dropdown_select_Detector", id="Dropdown_select_Detector",
@ -296,37 +365,6 @@ class AnalysisExplorer:
), ),
justify="start", justify="start",
), ),
# NEW: Analysis-type selector (summary/questions/summary_and_questions)
dbc.Row(
dcc.Dropdown(
id="Dropdown_analysis_type",
options=[{"label": v, "value": v} for v in SUMMARY_ANALYSIS_TYPE],
value="summary_and_questions",
style={"width": "60%", "margin-top": "8px"},
),
justify="start",
),
# NEW: Enable image-level tasks (VQA / caption) checkbox
dbc.Row(
dcc.Checklist(
id="checkbox_enable_image_tasks",
options=[{"label": "Enable Image Tasks (Caption / VQA)", "value": "enabled"}],
value=["enabled"], # default enabled
inline=True,
style={"margin-top": "8px"},
),
justify="start",
),
# NEW: Questions textarea (newline-separated). Only used if analysis_type includes "questions".
dbc.Row(
dcc.Textarea(
id="textarea_questions",
value="Are there people in the image?\nWhat is this picture about?",
placeholder="One question per line...",
style={"width": "60%", "height": "120px", "margin-top": "8px"},
),
justify="start",
),
dbc.Row( dbc.Row(
children=[self._create_setting_layout()], children=[self._create_setting_layout()],
id="div_detector_args", id="div_detector_args",
@ -402,15 +440,22 @@ class AnalysisExplorer:
} }
if setting_input == "TextDetector": if setting_input == "TextDetector":
return display_flex, display_none, display_none return display_flex, display_none, display_none, display_none
if setting_input == "EmotionDetector": if setting_input == "EmotionDetector":
return display_none, display_flex, display_none return display_none, display_flex, display_none, display_none
if setting_input == "ColorDetector": if setting_input == "ColorDetector":
return display_none, display_none, display_flex return display_none, display_none, display_flex, display_none
if setting_input == "VQA":
return display_none, display_none, display_none, display_flex
else: else:
return display_none, display_none, display_none return display_none, display_none, display_none, display_none
def _parse_questions(self, text: Optional[str]) -> Optional[List[str]]:
if not text:
return None
qs = [q.strip() for q in text.splitlines() if q.strip()]
return qs if qs else None
def _right_output_analysis( def _right_output_analysis(
self, self,
@ -418,8 +463,9 @@ class AnalysisExplorer:
all_img_options: dict, all_img_options: dict,
current_img_value: str, current_img_value: str,
detector_value: str, detector_value: str,
analysis_type_value: str,
textarea_questions_value: str,
setting_privacy_env_var: str, setting_privacy_env_var: str,
checkbox_enable_image_tasks_value: List[str],
setting_emotion_emotion_threshold: int, setting_emotion_emotion_threshold: int,
setting_emotion_race_threshold: int, setting_emotion_race_threshold: int,
setting_emotion_gender_threshold: int, setting_emotion_gender_threshold: int,
@ -439,22 +485,39 @@ class AnalysisExplorer:
"EmotionDetector": faces.EmotionDetector, "EmotionDetector": faces.EmotionDetector,
"TextDetector": text.TextDetector, "TextDetector": text.TextDetector,
"ColorDetector": colors.ColorDetector, "ColorDetector": colors.ColorDetector,
"VQA": image_summary.ImageSummaryDetector,
} }
# Get image ID from dropdown value, which is the filepath # Get image ID from dropdown value, which is the filepath
if current_img_value is None: if current_img_value is None:
return {} return {}
image_id = all_img_options[current_img_value] image_id = all_img_options[current_img_value]
# copy image so prvious runs don't leave their default values in the dict image_copy = self.mydict.get(image_id, {}).copy()
image_copy = self.mydict[image_id].copy()
analysis_dict: Dict[str, Any] = {}
if detector_value == "VQA":
try:
qwen_model = MultimodalSummaryModel(
model_id="Qwen/Qwen2.5-VL-3B-Instruct"
) # TODO: allow user to specify model
vqa_cls = identify_dict.get("VQA")
vqa_detector = vqa_cls(qwen_model, subdict={})
questions_list = self._parse_questions(textarea_questions_value)
analysis_result = vqa_detector.analyse_image(
image_copy,
analysis_type=analysis_type_value,
list_of_questions=questions_list,
is_concise_summary=True,
is_concise_answer=True,
)
analysis_dict = analysis_result or {}
except Exception as e:
warnings.warn(f"VQA/Image tasks failed: {e}")
analysis_dict = {"image_tasks_error": str(e)}
else:
# detector value is the string name of the chosen detector # detector value is the string name of the chosen detector
identify_function = identify_dict[detector_value] identify_function = identify_dict[detector_value]
identify_function = identify_dict.get(detector_value)
if identify_function is None:
detector_class = None
if detector_value == "TextDetector": if detector_value == "TextDetector":
detector_class = identify_function( detector_class = identify_function(
image_copy, image_copy,
@ -484,33 +547,9 @@ class AnalysisExplorer:
else: else:
detector_class = identify_function(image_copy) detector_class = identify_function(image_copy)
if detector_class is not None:
analysis_dict = detector_class.analyse_image() analysis_dict = detector_class.analyse_image()
else:
analysis_dict = {}
image_tasks_result: Dict[str, Any] = {} new_analysis_dict: Dict[str, Any] = {}
enable_image_tasks = "enabled" in (checkbox_enable_image_tasks_value or [])
if enable_image_tasks:
# parse questions textarea: newline separated
if textarea_questions_value:
questions_list = [q.strip() for q in textarea_questions_value.splitlines() if q.strip()]
else:
questions_list = None
try:
image_tasks_result = self.analyse_image(
image_copy,
analysis_type=analysis_type_value,
list_of_questions=questions_list,
is_concise_summary=True,
is_concise_answer=True,
)
except Exception as e:
warnings.warn(f"Image tasks failed: {e}")
image_tasks_result = {"image_tasks_error": str(e)}
# Initialize an empty dictionary
new_analysis_dict = {}
# Iterate over the items in the original dictionary # Iterate over the items in the original dictionary
for k, v in analysis_dict.items(): for k, v in analysis_dict.items():
@ -524,21 +563,15 @@ class AnalysisExplorer:
# Add the new key-value pair to the new dictionary # Add the new key-value pair to the new dictionary
new_analysis_dict[k] = new_value new_analysis_dict[k] = new_value
if "caption" in image_tasks_result:
new_analysis_dict["caption"] = image_tasks_result.get("caption", "")
if "vqa" in image_tasks_result:
# vqa is expected to be a dict; convert to readable string
vqa_entries = image_tasks_result["vqa"]
if isinstance(vqa_entries, dict):
new_analysis_dict["vqa"] = "; ".join([f"{q}: {a}" for q, a in vqa_entries.items()])
else:
new_analysis_dict["vqa"] = str(vqa_entries)
for err_key in ("caption_error", "vqa_error", "image_tasks_error"):
if err_key in image_tasks_result:
new_analysis_dict[err_key] = image_tasks_result[err_key]
df = pd.DataFrame([new_analysis_dict]).set_index("filename").T df = pd.DataFrame([new_analysis_dict]).set_index("filename").T
df.index.rename("filename", inplace=True) df.index.rename("filename", inplace=True)
return dbc.Table.from_dataframe( return dbc.Table.from_dataframe(
df, striped=True, bordered=True, hover=True, index=True df, striped=True, bordered=True, hover=True, index=True
) )
def _show_questions_textarea_on_demand(self, analysis_type_value: str) -> dict:
if analysis_type_value in ("questions", "summary_and_questions"):
return {"display": "block", "width": "100%"}
else:
return {"display": "none"}

Просмотреть файл

@ -16,7 +16,7 @@ class ImageSummaryDetector(AnalysisMethod):
def __init__( def __init__(
self, self,
summary_model: MultimodalSummaryModel, summary_model: MultimodalSummaryModel,
subdict: dict = {}, subdict: Optional[Dict[str, Any]] = None,
) -> None: ) -> None:
""" """
Class for analysing images using QWEN-2.5-VL model. Class for analysing images using QWEN-2.5-VL model.
@ -29,6 +29,8 @@ class ImageSummaryDetector(AnalysisMethod):
Returns: Returns:
None. None.
""" """
if subdict is None:
subdict = {}
super().__init__(subdict) super().__init__(subdict)
self.summary_model = summary_model self.summary_model = summary_model
@ -148,7 +150,50 @@ class ImageSummaryDetector(AnalysisMethod):
return analysis_type, list_of_questions, is_summary, is_questions return analysis_type, list_of_questions, is_summary, is_questions
def analyse_image(
    self,
    entry: dict,
    analysis_type: Union[str, AnalysisType] = AnalysisType.SUMMARY_AND_QUESTIONS,
    list_of_questions: Optional[List[str]] = None,
    max_questions_per_image: int = 32,
    is_concise_summary: bool = True,
    is_concise_answer: bool = True,
) -> Dict[str, Any]:
    """Analyse a single image entry.

    Depending on *analysis_type*, the returned dict gains:
      - 'caption' (str) when a summary is requested
      - 'vqa' (dict) when questions are requested

    A failure in either step only emits a warning, so a partial result
    is still returned rather than raising.
    """
    # The entry itself is adopted as the working dict; result keys are
    # written into it and it is returned to the caller.
    self.subdict = entry
    (
        analysis_type,
        list_of_questions,
        is_summary,
        is_questions,
    ) = self._validate_analysis_type(
        analysis_type, list_of_questions, max_questions_per_image
    )

    if is_summary:
        try:
            captions = self.generate_caption(
                entry,
                num_return_sequences=1,
                is_concise_summary=is_concise_summary,
            )
            self.subdict["caption"] = captions[0] if captions else ""
        except Exception as e:
            warnings.warn(f"Caption generation failed: {e}")

    if is_questions:
        try:
            answers = self.answer_questions(
                list_of_questions, entry, is_concise_answer
            )
            self.subdict["vqa"] = answers
        except Exception as e:
            warnings.warn(f"VQA failed: {e}")

    return self.subdict
def analyse_images_from_dict(
self, self,
analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS, analysis_type: Union[AnalysisType, str] = AnalysisType.SUMMARY_AND_QUESTIONS,
list_of_questions: Optional[List[str]] = None, list_of_questions: Optional[List[str]] = None,
@ -191,9 +236,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
entry["caption"] = caps[0] if caps else "" entry["caption"] = caps[0] if caps else ""
except Exception as e: except Exception as e:
warnings.warn( warnings.warn(f"Caption generation failed: {e}")
"Caption generation failed for key %s: %s", key, e
)
if is_questions: if is_questions:
try: try:
@ -202,7 +245,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
entry["vqa"] = vqa_map entry["vqa"] = vqa_map
except Exception as e: except Exception as e:
warnings.warn("VQA failed for key %s: %s", key, e) warnings.warn(f"VQA failed: {e}")
self.subdict[key] = entry self.subdict[key] = entry
return self.subdict return self.subdict
@ -251,8 +294,7 @@ class ImageSummaryDetector(AnalysisMethod):
) )
except RuntimeError as e: except RuntimeError as e:
warnings.warn( warnings.warn(
"Retry without autocast failed: %s. Attempting cudnn-disabled retry.", f"Retry without autocast failed: {e}. Attempting cudnn-disabled retry."
e,
) )
cudnn_was_enabled = ( cudnn_was_enabled = (
torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled torch.backends.cudnn.is_available() and torch.backends.cudnn.enabled