% Journal article entry. Text outside an entry (this line, and the RIS record
% and plain-text citations that follow the closing brace) is ignored by BibTeX.
@article{ART002198195,
  author  = {PARK, SAJOON and Kim, Jae Ho},
  title   = {Paragraph-based {K-Means} Clustering by using Meaning-based Paragraph Division},
  journal = {Journal of Knowledge Information Technology and Systems},
  issn    = {1975-7700},
  year    = {2017},
  volume  = {12},
  number  = {1},
  pages   = {157--164},
  doi     = {10.34163/jkits.2017.12.1.014},
}
TY - JOUR
AU - SAJOON PARK
AU - Jae Ho Kim
TI - Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division
JO - Journal of Knowledge Information Technology and Systems
PY - 2017
VL - 12
IS - 1
PB - Korea Knowledge Information Technology Society
SP - 157
EP - 164
SN - 1975-7700
AB - As the number of electronic documents explosively increases, it becomes more and more difficult to retrieve information from them rapidly and accurately. To solve this problem, documents are clustered in various ways, and generally the K-Means algorithm is used to achieve this. The K-Means algorithm is adequate for clustering many documents rapidly and easily, but it does not consider the meaning of the documents when clustering. In this research, we propose a document clustering technique that uses meaning-based paragraphs. The proposed technique divides the documents in a document set into meaning-based paragraphs by measuring the similarity between sentences, chooses the representative paragraphs having the maximum coherence value from each document, and then applies the K-Means algorithm to them. In this paper, different from existing methods, we propose a novel similarity function between two adjacent sentences that uses WordNet as an ontology to calculate the similarity between words. We also introduce a method for calculating the coherence of a meaning-based paragraph by normalizing the sum of the tf-idf values of the words in the paragraph. We conducted experiments to evaluate the performance of the proposed technique using the Reuters-21578 document set. The experimental results showed that the document clustering technique using meaning-based paragraphs improves the precision and the recall of document retrieval.
KW - Document retrieval;Clustering;Meaning-based paragraph;K-means clustering method;Reuter-21578 document set;Precision;recall
DO - 10.34163/jkits.2017.12.1.014
ER -
SAJOON PARK and Jae Ho Kim. (2017). Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division. Journal of Knowledge Information Technology and Systems, 12(1), 157-164.
SAJOON PARK and Jae Ho Kim. 2017, "Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division", Journal of Knowledge Information Technology and Systems, vol. 12, no. 1, pp. 157-164. Available from: doi:10.34163/jkits.2017.12.1.014
SAJOON PARK, Jae Ho Kim "Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division" Journal of Knowledge Information Technology and Systems 12.1 pp.157-164 (2017) : 157.
SAJOON PARK, Jae Ho Kim. Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division. Journal of Knowledge Information Technology and Systems. 2017; 12(1), 157-164. Available from: doi:10.34163/jkits.2017.12.1.014
SAJOON PARK and Jae Ho Kim. "Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division" Journal of Knowledge Information Technology and Systems 12, no.1 (2017) : 157-164. doi: 10.34163/jkits.2017.12.1.014
SAJOON PARK; Jae Ho Kim. Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division. Journal of Knowledge Information Technology and Systems, 12(1), 157-164. doi: 10.34163/jkits.2017.12.1.014
SAJOON PARK; Jae Ho Kim. Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division. Journal of Knowledge Information Technology and Systems. 2017; 12(1) 157-164. doi: 10.34163/jkits.2017.12.1.014
SAJOON PARK, Jae Ho Kim. Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division. Journal of Knowledge Information Technology and Systems. 2017; 12(1), 157-164. Available from: doi:10.34163/jkits.2017.12.1.014
SAJOON PARK and Jae Ho Kim. "Paragraph-based K-Means Clustering by using Meaning-based Paragraph Division" Journal of Knowledge Information Technology and Systems 12, no.1 (2017) : 157-164. doi: 10.34163/jkits.2017.12.1.014