% Journal article entry. The citation key is followed by a comma and every
% field sits inside the entry's braces; the closing brace comes after the
% last field.
@article{ART003349330,
  author  = {Kang, Min-Ju and Kim, Jin-Young},
  title   = {A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in {AI} Code Generators},
  journal = {Journal of The Korea Society of Computer and Information},
  issn    = {1598-849X},
  year    = {2026},
  volume  = {31},
  number  = {6},
  pages   = {153--159},
}
TY  - JOUR
AU  - Min-Ju Kang
AU  - Jin-Young Kim
TI  - A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators
JO  - Journal of The Korea Society of Computer and Information
PY  - 2026
VL  - 31
IS  - 6
PB  - The Korean Society Of Computer And Information
SP  - 153
EP  - 159
SN  - 1598-849X
AB  - The rapid adoption of Large Language Model-based AI code generators has led to an escalation in data poisoning attacks. While previous research primarily focuses on post-training defense mechanisms, preventative measures at the training data level remain scarce; as a result, data poisoning continues to exert a direct and persistent influence on the internal representations of these models. Therefore, this study suggests a defensive data preprocessing pipeline to address data poisoning attacks on AI code generators. The pipeline leverages a code language model to quantify distributional anomaly and consistency scores for each data point, which are then integrated with CVSS scores to determine final risk levels, thereby enabling the systematic identification and removal of high-risk data to enhance the overall reliability of the training set. As a result, the application of this pipeline led to an approximately 75% reduction in the Attack Success Rate (ASR) relative to the baseline. Ultimately, these findings demonstrate that the proposed preprocessing pipeline effectively mitigates data poisoning and highlights the practical viability of a data-centric defense strategy.
KW  - Data Poisoning
KW  - Data-centric Security
KW  - Data Preprocessing
KW  - AI Code Generators
KW  - CVSS Score
DO  - 
UR  - 
ER  - 
Min-Ju Kang and Jin-Young Kim. (2026). A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators. Journal of The Korea Society of Computer and Information, 31(6), 153-159.
Min-Ju Kang and Jin-Young Kim. 2026, "A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators", Journal of The Korea Society of Computer and Information, vol.31, no.6, pp.153-159.
Min-Ju Kang, Jin-Young Kim. "A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators." Journal of The Korea Society of Computer and Information 31.6 (2026): 153-159.
Min-Ju Kang, Jin-Young Kim. A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators. Journal of The Korea Society of Computer and Information. 2026; 31(6), 153-159.
Min-Ju Kang and Jin-Young Kim. "A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators" Journal of The Korea Society of Computer and Information 31, no.6 (2026) : 153-159.
Min-Ju Kang; Jin-Young Kim. A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators. Journal of The Korea Society of Computer and Information, 2026, 31(6), 153-159.
Min-Ju Kang; Jin-Young Kim. A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators. Journal of The Korea Society of Computer and Information. 2026; 31(6) 153-159.
Min-Ju Kang, Jin-Young Kim. A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators. Journal of The Korea Society of Computer and Information. 2026; 31(6), 153-159.
Min-Ju Kang and Jin-Young Kim. "A Defensive Data Preprocessing Pipeline for Mitigating Data Poisoning in AI Code Generators" Journal of The Korea Society of Computer and Information 31, no.6 (2026) : 153-159.