@comment{Journal article entry. Author names are kept in the order given by the source record; confirm the given/family name order (notably for the last author) before relying on style-driven name formatting.}
@article{ART003152988,
  author  = {Young-Seob Jeong and Yeong-Jin Kim and Medard Edmund Mswahili and Jiyoung Woo and Kang Ah Reum},
  title   = {Malware Byte Stream Analysis Using Overlapped {LDA}},
  journal = {Journal of The Korea Society of Computer and Information},
  issn    = {1598-849X},
  year    = {2024},
  volume  = {29},
  number  = {12},
  pages   = {109--119}
}
TY  - JOUR
AU  - Young-Seob Jeong
AU  - Yeong-Jin Kim
AU  - Medard Edmund Mswahili
AU  - Jiyoung Woo
AU  - Kang Ah Reum
TI  - Malware Byte Stream Analysis Using Overlapped LDA
JO  - Journal of The Korea Society of Computer and Information
PY  - 2024
VL  - 29
IS  - 12
PB  - The Korean Society Of Computer And Information
SP  - 109
EP  - 119
SN  - 1598-849X
AB  - More documents are appearing in online platforms, and people are vulnerable to malicious attacks in non-executable documents. Recently data-driven approaches have shown successful results in malware detection task. As they heavily rely on the dataset, it is important to make a lot of annotated data, while the annotation process is normally performed manually by domain experts. Therefore, it is necessary to develop a system or a tool that analyzes the files and help the annotation process. In this paper, we propose a new method that automatically analyzes files and generates byte-level labels using a modified version of overlapped dirichlet allocation that clusters given bytes into two (e.g., malware and benign) or more groups. By experimental results with our annotated dataset, we demonstrated that the generated byte-level labels achieved high recall (95~100%). We observed that our model suffered from low precision because the dataset is sparsely annotated, but it still has a potential to aid in finding suspicious bytes for malware analysis. We also provide sample results visualized by highlights with different colors.
KW  - malware analysis
KW  - topic model
KW  - overlapped latent dirichlet allocation
KW  - byte stream
DO  - 
UR  - 
ER  - 
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo and Kang Ah Reum. (2024). Malware Byte Stream Analysis Using Overlapped LDA. Journal of The Korea Society of Computer and Information, 29(12), 109-119.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo and Kang Ah Reum. 2024, "Malware Byte Stream Analysis Using Overlapped LDA", Journal of The Korea Society of Computer and Information, vol.29, no.12, pp.109-119.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo, Kang Ah Reum "Malware Byte Stream Analysis Using Overlapped LDA" Journal of The Korea Society of Computer and Information 29.12 pp.109-119 (2024) : 109.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo, Kang Ah Reum. Malware Byte Stream Analysis Using Overlapped LDA. Journal of The Korea Society of Computer and Information. 2024; 29(12), 109-119.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo and Kang Ah Reum. "Malware Byte Stream Analysis Using Overlapped LDA" Journal of The Korea Society of Computer and Information 29, no.12 (2024) : 109-119.
Young-Seob Jeong; Yeong-Jin Kim; Medard Edmund Mswahili; Jiyoung Woo; Kang Ah Reum. Malware Byte Stream Analysis Using Overlapped LDA. Journal of The Korea Society of Computer and Information, 29(12), 109-119.
Young-Seob Jeong; Yeong-Jin Kim; Medard Edmund Mswahili; Jiyoung Woo; Kang Ah Reum. Malware Byte Stream Analysis Using Overlapped LDA. Journal of The Korea Society of Computer and Information. 2024; 29(12) 109-119.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo, Kang Ah Reum. Malware Byte Stream Analysis Using Overlapped LDA. Journal of The Korea Society of Computer and Information. 2024; 29(12), 109-119.
Young-Seob Jeong, Yeong-Jin Kim, Medard Edmund Mswahili, Jiyoung Woo and Kang Ah Reum. "Malware Byte Stream Analysis Using Overlapped LDA" Journal of The Korea Society of Computer and Information 29, no.12 (2024) : 109-119.