"""PixmoPoints point-localization benchmark (``vlmeval/dataset/pixmopoints.py``).

``vlmeval/dataset/__init__.py`` exposes the class through
``from .pixmopoints import PixmoPointsDataset`` and lists it among the other
dataset classes (right after ``SciDocBench``).
"""
import json

import numpy as np
import pandas as pd

from vlmeval.smp import dump, get_intermediate_file_path, load
from .image_base import ImageBaseDataset
from .utils.spatial_bench.tools.utils import Point2DParser


class PixmoPointsDataset(ImageBaseDataset):
    """Point localization evaluation using Hungarian matching.

    For every sample the predicted points and the ground-truth points are
    paired one-to-one with ``scipy.optimize.linear_sum_assignment`` on their
    pairwise Euclidean distances. A pair counts as a match when its distance
    is below ``DISTANCE_THRESHOLD``. Precision, recall and F1 are computed per
    sample and then averaged over all samples.
    """

    TYPE = 'VQA'
    DATASET_URL = {'PixmoPoints': ''}
    DATASET_MD5 = {}

    DISTANCE_THRESHOLD = 0.05  # 5% of normalized image size

    # Appended to the question so the model answers in a parseable JSON layout.
    PROMPT_SUFFIX = (
        ' Output the point coordinates in JSON format.\n'
        'For example:\n'
        '[\n'
        ' {"point_2d": [x, y], "label": "point_1"}\n'
        ']'
    )

    def build_prompt(self, line):
        """Build the base prompt and append the JSON output instructions.

        Args:
            line: Dataset record, forwarded unchanged to the parent
                ``build_prompt``.

        Returns:
            The message list produced by the parent class, with
            ``PROMPT_SUFFIX`` appended to the ``value`` of its last message.
        """
        msgs = super().build_prompt(line)
        # Internal invariant: the suffix is only meaningful on a text message.
        assert msgs[-1]['type'] == 'text', 'last prompt message must be of type "text"'
        msgs[-1]['value'] += self.PROMPT_SUFFIX
        return msgs

    @staticmethod
    def _image_dim(value):
        """Coerce a width/height cell to a positive ``int``, defaulting to 1.

        Missing, NaN, non-numeric and sub-pixel (< 1) values all fall back to
        1 instead of raising or producing a zero dimension.
        """
        try:
            dim = float(value)
        except (TypeError, ValueError):
            return 1
        if not np.isfinite(dim) or dim < 1:
            return 1
        return int(dim)

    def _score_points(self, pred_pts, gt_pts):
        """Return ``(precision, recall, f1)`` for one sample.

        Args:
            pred_pts: List of predicted ``[x, y]`` points.
            gt_pts: List of ground-truth ``[x, y]`` points in the same
                coordinate system.

        Returns:
            A tuple of three Python floats. Special cases:
            no ground truth and no prediction -> ``(1.0, 1.0, 1.0)``;
            no ground truth but predictions -> ``(0.0, 1.0, 0.0)``;
            ground truth but no prediction -> ``(0.0, 0.0, 0.0)``.
        """
        if len(gt_pts) == 0:
            return (1.0, 1.0, 1.0) if len(pred_pts) == 0 else (0.0, 1.0, 0.0)
        if len(pred_pts) == 0:
            return 0.0, 0.0, 0.0

        # Imported lazily so scipy is only required when matching is needed.
        from scipy.optimize import linear_sum_assignment

        pred_arr = np.asarray(pred_pts, dtype=float)
        gt_arr = np.asarray(gt_pts, dtype=float)
        # dists[i, j] = Euclidean distance between prediction i and GT point j.
        dists = np.linalg.norm(pred_arr[:, None] - gt_arr[None, :], axis=2)
        # NOTE(review): the assignment minimizes the total distance, which does
        # not always maximize the number of pairs under the threshold. Kept
        # as-is so scores stay comparable with earlier runs.
        row_ind, col_ind = linear_sum_assignment(dists)
        matches = int(np.count_nonzero(dists[row_ind, col_ind] < self.DISTANCE_THRESHOLD))

        precision = matches / len(pred_pts)
        recall = matches / len(gt_pts)
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom > 0 else 0.0
        return precision, recall, f1

    def evaluate(self, eval_file, **judge_kwargs):
        """Score a prediction file and write detail and summary files.

        Args:
            eval_file: Path of the prediction file; it must provide ``index``
                and ``prediction`` columns.
            **judge_kwargs: Accepted for interface compatibility; not used.

        Returns:
            Dict with the mean ``precision``, ``recall`` and ``f1`` over all
            rows of the prediction file (all 0 when the file has no rows).

        Side effects:
            Dumps the per-sample metrics to the ``_detail`` intermediate file
            and the summary dict to the ``_score`` JSON intermediate file.
        """
        data = load(eval_file)
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        data = data.sort_values(by='index')

        # Dataset metadata keyed by stringified index, so lookups do not depend
        # on whether the index was stored as int or str.
        meta = self.data.copy()
        meta['index'] = meta['index'].astype(str)
        meta = meta.set_index('index')
        # A duplicated index would make ``meta.loc[...]`` return a DataFrame
        # rather than a single row; keep the first occurrence.
        meta = meta[~meta.index.duplicated(keep='first')]
        data['index'] = data['index'].astype(str)

        details = []
        for _, row in data.iterrows():
            idx = row['index']
            # Fall back to the prediction row when the sample is not in the metadata.
            meta_row = meta.loc[idx] if idx in meta.index else row
            width = self._image_dim(meta_row.get('width', row.get('width', 1)))
            height = self._image_dim(meta_row.get('height', row.get('height', 1)))

            # presumably returns an array of normalized [x, y] points, or None
            # when nothing could be parsed -- verify against Point2DParser.
            pred_pts = Point2DParser.parse(str(row['prediction']), width, height, output='norm')
            pred_pts = pred_pts.tolist() if pred_pts is not None else []
            gt_pts = self._parse_points(str(meta_row.get('answer', row.get('answer', ''))))

            precision, recall, f1 = self._score_points(pred_pts, gt_pts)
            details.append({
                'index': idx,
                'precision': precision,
                'recall': recall,
                'f1': f1,
            })

        total = len(details)
        result = {
            key: (sum(item[key] for item in details) / total if total > 0 else 0)
            for key in ('precision', 'recall', 'f1')
        }
        dump(pd.DataFrame(details), get_intermediate_file_path(eval_file, '_detail'))
        dump(result, get_intermediate_file_path(eval_file, '_score', 'json'))
        return result

    @staticmethod
    def _parse_points(s):
        """Parse a JSON ground-truth string into a list of ``[x, y]`` floats.

        Accepted list items are a two-element list ``[x, y]`` or a dict whose
        ``'point_2d'`` (checked first) or ``'point'`` entry is a two-element
        list. Items of any other shape, items with non-numeric coordinates and
        points outside the unit square ``[0, 1] x [0, 1]`` are skipped.

        Args:
            s: JSON text expected to encode a list of points.

        Returns:
            List of ``[x, y]`` float pairs; empty when ``s`` is not valid JSON
            or does not decode to a list.
        """
        try:
            pts = json.loads(s)
        except (json.JSONDecodeError, TypeError, ValueError):
            return []
        if not isinstance(pts, list):
            return []

        result = []
        for p in pts:
            point = None
            if isinstance(p, list):
                if len(p) == 2:
                    point = p
            elif isinstance(p, dict):
                for key in ('point_2d', 'point'):
                    candidate = p.get(key)
                    if isinstance(candidate, list) and len(candidate) == 2:
                        point = candidate
                        break
            if point is None:
                continue
            try:
                x, y = float(point[0]), float(point[1])
            except (TypeError, ValueError):
                continue
            # Ground truth must already be normalized; NaN fails both comparisons.
            if 0.0 <= x <= 1.0 and 0.0 <= y <= 1.0:
                result.append([x, y])
        return result