% Journal article on embedding-model-based duplicate verification of MARC
% records (Journal of Korean Library and Information Science Society, 2025).
% NOTE(review): author names are kept in the exported "Given Family" order;
% "Song Min-Geon" may be written family-name-first. Confirm, then convert all
% names to the unambiguous "Family, Given" form.
@article{ART003280299,
  author  = {Soon-Young Lee and Song Min-Geon and Soosang Lee},
  title   = {Embedding Model-Based Approach to Duplicate Verification in {MARC} Records},
  journal = {Journal of Korean Library and Information Science Society},
  issn    = {2466-2542},
  year    = {2025},
  volume  = {56},
  number  = {4},
  pages   = {1--20},
}
TY - JOUR
AU - Soon-Young Lee
AU - Song Min-Geon
AU - Soosang Lee
TI - Embedding Model-Based Approach to Duplicate Verification in MARC Records
JO - Journal of Korean Library and Information Science Society
PY - 2025
VL - 56
IS - 4
PB - Korean Library And Information Science Society
SP - 1
EP - 20
SN - 2466-2542
AB - This study aimed to improve the performance of duplicate verification algorithms for MARC records by applying AI technology. To overcome the limitations of existing rule-based algorithms, we utilized AI embedding models based on semantic similarity of text to vectorize MARC records and verify duplicate records through similarity search and semantic similarity analysis. The specific research methodology consisted of two phases. First, we implemented a duplicate verification algorithm for MARC records based on vector similarity search using embedding models and evaluated its performance using the same dataset as the prior study. Second, reflecting on the evaluation results of the initial experiment, we implemented an algorithm that maximizes the advantages of the embedding approach—specifically, identifying duplicate records caused by variations in string notation. For this purpose, we evaluated the algorithm’s performance using newly constructed experimental data and evaluation metrics. The experimental dataset was designed to reflect notational variations that may occur in actual library settings, applying eight transformation rules. The results of the first experiment showed that the rate of correctly identifying identical groups as duplicates improved compared to the prior study. However, the embedding approach revealed limitations in areas requiring precise matching of numbers and special characters, such as incorrectly judging multi-volume materials with different volume information as similar. The results of the second experiment, designed to validate the advantages of the embedding approach, demonstrated 100% identification of both duplicate records and transformation rules across the entire experimental dataset.
KW - AI
KW - Embedding Models
KW - Vector Similarity Search
KW - MARC Records
KW - Duplicate Verification
DO -
UR -
ER -
Soon-Young Lee, Song Min-Geon and Soosang Lee. (2025). Embedding Model-Based Approach to Duplicate Verification in MARC Records. Journal of Korean Library and Information Science Society, 56(4), 1-20.
Soon-Young Lee, Song Min-Geon and Soosang Lee. 2025, "Embedding Model-Based Approach to Duplicate Verification in MARC Records", Journal of Korean Library and Information Science Society, vol.56, no.4, pp.1-20.
Soon-Young Lee, Song Min-Geon, Soosang Lee "Embedding Model-Based Approach to Duplicate Verification in MARC Records" Journal of Korean Library and Information Science Society 56.4 (2025) : 1-20.
Soon-Young Lee, Song Min-Geon, Soosang Lee. Embedding Model-Based Approach to Duplicate Verification in MARC Records. Journal of Korean Library and Information Science Society. 2025; 56(4), 1-20.
Soon-Young Lee, Song Min-Geon and Soosang Lee. "Embedding Model-Based Approach to Duplicate Verification in MARC Records" Journal of Korean Library and Information Science Society 56, no.4 (2025) : 1-20.
Soon-Young Lee; Song Min-Geon; Soosang Lee. Embedding Model-Based Approach to Duplicate Verification in MARC Records. Journal of Korean Library and Information Science Society, 2025, 56(4), 1-20.
Soon-Young Lee; Song Min-Geon; Soosang Lee. Embedding Model-Based Approach to Duplicate Verification in MARC Records. Journal of Korean Library and Information Science Society. 2025; 56(4) 1-20.
Soon-Young Lee, Song Min-Geon, Soosang Lee. Embedding Model-Based Approach to Duplicate Verification in MARC Records. Journal of Korean Library and Information Science Society. 2025; 56(4), 1-20.
Soon-Young Lee, Song Min-Geon and Soosang Lee. "Embedding Model-Based Approach to Duplicate Verification in MARC Records" Journal of Korean Library and Information Science Society 56, no.4 (2025) : 1-20.