% Journal article: Sun (2015), Journal of The Korea Society of Computer and Information, 20(7), pp. 1--7.
@article{ART002016376,
  author  = {Sun, Bokkeun},
  title   = {A Study of Main Contents Extraction from Web News Pages based on {XPath} Analysis},
  journal = {Journal of The Korea Society of Computer and Information},
  issn    = {1598-849X},
  year    = {2015},
  volume  = {20},
  number  = {7},
  pages   = {1--7},
}
TY - JOUR
AU - Bokkeun Sun
TI - A Study of Main Contents Extraction from Web News Pages based on XPath Analysis
JO - Journal of The Korea Society of Computer and Information
PY - 2015
VL - 20
IS - 7
PB - The Korean Society Of Computer And Information
SP - 1
EP - 7
SN - 1598-849X
AB - Although data on the internet can be used in various fields, such as a data source for IR (Information Retrieval), data mining, and knowledge information services, it contains a lot of unnecessary information. The removal of this unnecessary data is a problem to be solved prior to the study of knowledge-based information services that are based on web page data. In this paper, we solve the problem through the implementation of XTractor (XPath Extractor). Since XPath is used to navigate the attribute data and the data elements in an XML document, the XPath analysis is carried out through XTractor. XTractor extracts the main text by HTML parsing, XPath grouping, and detecting the XPath that contains the main data. As a result, the recognition and precision rates were shown to be 97.9% and 93.9%, respectively, except for a few cases in a large amount of experimental data, and it was confirmed that it is possible to properly extract the main text of the news.
KW - Main Text Extraction;Web News Page;XPath Grouping;HTML Parsing
DO -
UR -
ER -
Bokkeun Sun. (2015). A Study of Main Contents Extraction from Web News Pages based on XPath Analysis. Journal of The Korea Society of Computer and Information, 20(7), 1-7.
Bokkeun Sun. 2015, "A Study of Main Contents Extraction from Web News Pages based on XPath Analysis", Journal of The Korea Society of Computer and Information, vol.20, no.7 pp.1-7.
Bokkeun Sun "A Study of Main Contents Extraction from Web News Pages based on XPath Analysis" Journal of The Korea Society of Computer and Information 20.7 pp.1-7 (2015) : 1.
Bokkeun Sun. A Study of Main Contents Extraction from Web News Pages based on XPath Analysis. 2015; 20(7), 1-7.
Bokkeun Sun. "A Study of Main Contents Extraction from Web News Pages based on XPath Analysis" Journal of The Korea Society of Computer and Information 20, no.7 (2015) : 1-7.
Bokkeun Sun. A Study of Main Contents Extraction from Web News Pages based on XPath Analysis. Journal of The Korea Society of Computer and Information, 20(7), 1-7.
Bokkeun Sun. A Study of Main Contents Extraction from Web News Pages based on XPath Analysis. Journal of The Korea Society of Computer and Information. 2015; 20(7) 1-7.
Bokkeun Sun. A Study of Main Contents Extraction from Web News Pages based on XPath Analysis. 2015; 20(7), 1-7.
Bokkeun Sun. "A Study of Main Contents Extraction from Web News Pages based on XPath Analysis" Journal of The Korea Society of Computer and Information 20, no.7 (2015) : 1-7.