@comment{Journal article record. The citation key is followed by a comma so
that the fields below belong to the entry, and the entry is closed after the
last field. The acronym in the title is brace-protected against recasing.}
@article{ART003212307,
  author  = {Park, Seung-Hyeon and Yoo, Injae and Park, Byuong-Chan and Kim, Seok-Yoon and Kim, Youngmo},
  title   = {An Enhanced Scene Retrieval Method for {OTT} Content Based on Multimodal Action and Speech Recognition},
  journal = {Journal of Software Assessment and Valuation},
  issn    = {2092-8114},
  year    = {2025},
  volume  = {21},
  number  = {2},
  pages   = {61--69},
}
TY  - JOUR
AU  - Park, Seung-Hyeon
AU  - Yoo, Injae
AU  - Park, Byuong-Chan
AU  - Kim, Seok-Yoon
AU  - Kim, Youngmo
TI  - An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition
JO  - Journal of Software Assessment and Valuation
PY  - 2025
VL  - 21
IS  - 2
PB  - Korea Software Assessment and Valuation Society
SP  - 61
EP  - 69
SN  - 2092-8114
AB  - With the rapid growth of OTT (Over-the-Top) platforms, the volume of video content has increased exponentially, leading to a rising demand for precise and context-aware scene retrieval technologies. In narrative-driven content such as dramas and films, which often involve complex editing techniques and character-centric storytelling, conventional keyframe-based search methods fall short in capturing semantic continuity and scene context. This paper proposes an advanced method for OTT content scene retrieval based on multimodal action and speech recognition, combining a Transformer-based action recognition model with Speech-to-Text (STT) technology. The proposed approach segments continuous video frames into meaningful action intervals and constructs a de-duplicated scene graph by integrating key objects and their relationships within each segment. Furthermore, speech segments are accurately extracted and temporally aligned with visual data, enabling a unified multimodal representation of scenes. This integration supports more refined and semantically rich scene searches, such as character-centered navigation, emotion-based clip extraction, and dialogue-driven retrieval. The proposed method is expected to significantly enhance the personalization and reusability of OTT content in various user-centered applications.
KW  - Multimodal Scene Retrieval
KW  - Transformer-based Action Recognition
KW  - Speech-Visual Information Integration
KW  - OTT Content Analysis
KW  - Semantics-Centered Scene Graph
DO  -
UR  -
ER  -
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim and Youngmo Kim. (2025). An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition. Journal of Software Assessment and Valuation, 21(2), 61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim and Youngmo Kim. 2025, "An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition", Journal of Software Assessment and Valuation, vol.21, no.2, pp.61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim, Youngmo Kim. "An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition." Journal of Software Assessment and Valuation 21.2 (2025): 61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim, Youngmo Kim. An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition. Journal of Software Assessment and Valuation. 2025; 21(2), 61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim and Youngmo Kim. "An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition." Journal of Software Assessment and Valuation 21, no. 2 (2025): 61-69.
Seung-Hyeon Park; Injae Yoo; Byuong-Chan Park; Seok-Yoon Kim; Youngmo Kim. An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition. Journal of Software Assessment and Valuation, 2025, 21(2), 61-69.
Seung-Hyeon Park; Injae Yoo; Byuong-Chan Park; Seok-Yoon Kim; Youngmo Kim. An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition. Journal of Software Assessment and Valuation. 2025; 21(2): 61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim, Youngmo Kim. An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition. Journal of Software Assessment and Valuation. 2025; 21(2), 61-69.
Seung-Hyeon Park, Injae Yoo, Byuong-Chan Park, Seok-Yoon Kim and Youngmo Kim. "An Enhanced Scene Retrieval Method for OTT Content Based on Multimodal Action and Speech Recognition." Journal of Software Assessment and Valuation 21, no. 2 (2025): 61-69.