@comment{Journal article. The entry body must stay open until the final closing brace below; closing it right after the citation key leaves every field outside the entry, where BibTeX ignores it.}
@article{ART003338894,
  author  = {Jang, DongJu and Lee, Yeong-In and Kim, Ha-Young},
  title   = {A Simulation-Based {VQA} Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models},
  journal = {Journal of The Korea Society of Computer and Information},
  issn    = {1598-849X},
  year    = {2026},
  volume  = {31},
  number  = {5},
  pages   = {95--104},
}
TY  - JOUR
AU  - Jang, DongJu
AU  - Lee, Yeong-In
AU  - Kim, Ha-Young
TI  - A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models
JO  - Journal of The Korea Society of Computer and Information
PY  - 2026
VL  - 31
IS  - 5
PB  - The Korean Society Of Computer And Information
SP  - 95
EP  - 104
SN  - 1598-849X
AB  - This study proposes a multi-view video question-answering benchmark and dataset to train and evaluate vision-language models on inferring intrinsic physical properties, such as mass and elasticity, through robot-object interactions beyond simple scene recognition. To this end, we collected data by designing cube pushing and sphere dropping tasks based on inverse kinematics control within a simulation environment, and analyzed the performance by fine-tuning state-of-the-art models. The experimental results demonstrated that although pre-trained models showed low accuracy, their performance improved significantly in the mass inference task where the final displacement remains static, successfully overcoming existing text response biases after fine-tuning. Conversely, in the elasticity inference task, which requires tracking a momentary dynamic trajectory, the performance improvement was limited and the models exhibited a limitation of regressing to linguistic biases. In conclusion, this dataset provides an environment to quantitatively evaluate the physical reasoning capabilities of the models, contributing to laying the foundation for efficient action planning and decision-making in real-world robots in the future.
KW  - Vision-Language Model
KW  - Interactive Perception
KW  - Simulation
KW  - Robot Dataset
KW  - Physical property
DO  - 
UR  - 
ER  - 
DongJu Jang, Yeong-In Lee and Ha-Young Kim. (2026). A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models. Journal of The Korea Society of Computer and Information, 31(5), 95-104.
DongJu Jang, Yeong-In Lee and Ha-Young Kim. 2026, "A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models", Journal of The Korea Society of Computer and Information, vol.31, no.5, pp.95-104.
DongJu Jang, Yeong-In Lee, Ha-Young Kim "A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models" Journal of The Korea Society of Computer and Information 31.5 (2026): 95-104.
DongJu Jang, Yeong-In Lee, Ha-Young Kim. A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models. Journal of The Korea Society of Computer and Information. 2026; 31(5), 95-104.
DongJu Jang, Yeong-In Lee and Ha-Young Kim. "A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models" Journal of The Korea Society of Computer and Information 31, no.5 (2026) : 95-104.
DongJu Jang; Yeong-In Lee; Ha-Young Kim. A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models. Journal of The Korea Society of Computer and Information, 2026, 31(5), 95-104.
DongJu Jang; Yeong-In Lee; Ha-Young Kim. A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models. Journal of The Korea Society of Computer and Information. 2026; 31(5) 95-104.
DongJu Jang, Yeong-In Lee, Ha-Young Kim. A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models. Journal of The Korea Society of Computer and Information. 2026; 31(5), 95-104.
DongJu Jang, Yeong-In Lee and Ha-Young Kim. "A Simulation-Based VQA Dataset for Evaluating Intrinsic Physical Property Inference Capabilities of Vision-Language Models" Journal of The Korea Society of Computer and Information 31, no.5 (2026) : 95-104.