@comment{Journal article entry. "Korean" in the title is brace-protected so that sentence-casing styles keep its capital letter.}
@article{ART003305939,
  author  = {Im, Areum and Lee, Taehwa and Lee, Soojin},
  title   = {A Novel Two-Stage Attacks on {Korean} Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits},
  journal = {Journal of The Korea Society of Computer and Information},
  issn    = {1598-849X},
  year    = {2026},
  volume  = {31},
  number  = {2},
  pages   = {75--85},
  doi     = {10.9708/jksci.2026.31.02.075},
}
TY - JOUR
AU - Areum Im
AU - Taehwa Lee
AU - Soojin Lee
TI - A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits
JO - Journal of The Korea Society of Computer and Information
PY - 2026
VL - 31
IS - 2
PB - The Korean Society Of Computer And Information
SP - 75
EP - 85
SN - 1598-849X
AB - In this study, we propose a novel two-stage attack framework applicable to Korean-based language models with agglutinative characteristics. The first stage is an inference-time universal adversarial trigger (UAT) attack, performed without intervention in the learning process. It precisely searches for single-token triggers capable of reversing the model's predictions using only the gradient information. The second stage, targeting only samples that failed in the first stage, is an adversarial example attack. It replaces no more than two tokens combining particles and suffixes based on a morphology-preserving minimal edit strategy. The effectiveness of our framework was evaluated on the NSMC dataset using the KoBERT and KoELECTRA models. Experimental results showed that triggers attached to the end of sentences had a high attack success rate due to the characteristic of Korean language in which key information appears at the end of sentences. Furthermore, words that indirectly express sentiment also functioned as powerful triggers. The KoBERT model achieved an attack success rate of 0.963, and the KoELECTRA model achieved an attack success rate of 0.940.
KW - Adversarial example
KW - Universal adversarial trigger
KW - Korean language model
DO - 10.9708/jksci.2026.31.02.075
ER -
Areum Im, Taehwa Lee and Soojin Lee. (2026). A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits. Journal of The Korea Society of Computer and Information, 31(2), 75-85.
Areum Im, Taehwa Lee and Soojin Lee. 2026, "A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits", Journal of The Korea Society of Computer and Information, vol.31, no.2, pp.75-85. Available from: doi:10.9708/jksci.2026.31.02.075
Areum Im, Taehwa Lee, Soojin Lee "A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits" Journal of The Korea Society of Computer and Information 31.2 pp.75-85 (2026) : 75.
Areum Im, Taehwa Lee, Soojin Lee. A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits. Journal of The Korea Society of Computer and Information. 2026; 31(2), 75-85. Available from: doi:10.9708/jksci.2026.31.02.075
Areum Im, Taehwa Lee and Soojin Lee. "A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits" Journal of The Korea Society of Computer and Information 31, no.2 (2026) : 75-85. doi: 10.9708/jksci.2026.31.02.075
Areum Im; Taehwa Lee; Soojin Lee. (2026). A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits. Journal of The Korea Society of Computer and Information, 31(2), 75-85. doi: 10.9708/jksci.2026.31.02.075
Areum Im; Taehwa Lee; Soojin Lee. A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits. Journal of The Korea Society of Computer and Information. 2026; 31(2), 75-85. doi: 10.9708/jksci.2026.31.02.075
Areum Im, Taehwa Lee, Soojin Lee. A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits. Journal of The Korea Society of Computer and Information. 2026; 31(2), 75-85. Available from: doi:10.9708/jksci.2026.31.02.075
Areum Im, Taehwa Lee and Soojin Lee. "A Novel Two-Stage Attacks on Korean Language Models: Single-Token Triggers Search and Morphology-Preserving Minimal Edits" Journal of The Korea Society of Computer and Information 31, no.2 (2026) : 75-85. doi: 10.9708/jksci.2026.31.02.075