% Journal article record for the HTML Tag Depth Embedding (HTDE) paper.
% Author names are kept in the "First Last" order supplied by the exporter.
% NOTE(review): "Jang Hyun Jae" may be written family-name-first; confirm the
% surname and switch all names to "Last, First" form once verified.
% Acronyms in the title are braced so sentence-casing styles keep them upper-case.
@article{ART002891736,
  author  = {Jin-Wang Mok and Jang Hyun Jae and Hyun-Seob Lee},
  title   = {{HTML} Tag Depth Embedding: An Input Embedding Method of the {BERT} Model for Improving Web Document Reading Comprehension Performance},
  journal = {Journal of Internet of Things and Convergence},
  issn    = {2466-0078},
  year    = {2022},
  volume  = {8},
  number  = {5},
  pages   = {17--25},
  doi     = {10.20465/KIOTS.2022.8.5.017},
}
TY - JOUR
AU - Jin-Wang Mok
AU - Jang Hyun Jae
AU - Hyun-Seob Lee
TI - HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance
JO - Journal of Internet of Things and Convergence
PY - 2022
VL - 8
IS - 5
PB - The Korea Internet of Things Society
SP - 17
EP - 25
SN - 2466-0078
AB - Recently, a massive amount of data has been generated as the number of edge devices increases. In particular, the number of raw, unstructured HTML documents has increased.
Therefore, MRC (Machine Reading Comprehension), in which a natural language processing model finds the important information within an HTML document, is becoming more important. In this paper, we propose HTDE (HTML Tag Depth Embedding Method), which allows BERT to learn the depth of the HTML document structure. HTDE builds a tag stack from the HTML document for each input token in BERT and then extracts the depth information. After that, we add an HTML embedding layer, which takes the depth of the token as input, to the input embedding step of BERT. Since tokenization using HTDE identifies the HTML document structure through the relationship of surrounding tokens, HTDE improves the accuracy of BERT for HTML documents. Finally, we demonstrated that the proposed idea shows higher accuracy than the conventional embedding of BERT.
KW - Natural Language Processing;Machine Reading Comprehension;Embeddings;HTML;BERT
DO - 10.20465/KIOTS.2022.8.5.017
ER -
Jin-Wang Mok, Jang Hyun Jae and Hyun-Seob Lee. (2022). HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance. Journal of Internet of Things and Convergence, 8(5), 17-25.
Jin-Wang Mok, Jang Hyun Jae and Hyun-Seob Lee. 2022, "HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance", Journal of Internet of Things and Convergence, vol.8, no.5, pp.17-25. Available from: doi:10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok, Jang Hyun Jae, Hyun-Seob Lee "HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance" Journal of Internet of Things and Convergence 8.5 (2022): 17-25.
Jin-Wang Mok, Jang Hyun Jae, Hyun-Seob Lee. HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance. Journal of Internet of Things and Convergence. 2022; 8(5), 17-25. Available from: doi:10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok, Jang Hyun Jae and Hyun-Seob Lee. "HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance" Journal of Internet of Things and Convergence 8, no.5 (2022) : 17-25. doi: 10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok; Jang Hyun Jae; Hyun-Seob Lee. HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance. Journal of Internet of Things and Convergence, 8(5), 17-25. doi: 10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok; Jang Hyun Jae; Hyun-Seob Lee. HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance. Journal of Internet of Things and Convergence. 2022; 8(5) 17-25. doi: 10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok, Jang Hyun Jae, Hyun-Seob Lee. HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance. Journal of Internet of Things and Convergence. 2022; 8(5), 17-25. Available from: doi:10.20465/KIOTS.2022.8.5.017
Jin-Wang Mok, Jang Hyun Jae and Hyun-Seob Lee. "HTML Tag Depth Embedding: An Input Embedding Method of the BERT Model for Improving Web Document Reading Comprehension Performance" Journal of Internet of Things and Convergence 8, no.5 (2022) : 17-25. doi: 10.20465/KIOTS.2022.8.5.017