% Zeng et al. 2025: ResDecode journal article, Big Data Mining and Analytics 8(4).
% Text fields are kept ASCII-only (LaTeX markup instead of raw Unicode) so the
% entry works under both classic BibTeX and biblatex/Biber.
@article{Zeng2025,
  author   = {Zeng, Ziqian and Yu, Jiahong and Pang, Qianshi and Wang, Zihao and Zhuang, Huiping and Yu, Fan and Shao, Hongen and Zou, Xiaofeng},
  title    = {{ResDecode}: Accelerating Large Language Models Inference via Residual Decoding Heads},
  journal  = {Big Data Mining and Analytics},
  volume   = {8},
  number   = {4},
  pages    = {779--793},
  year     = {2025},
  keywords = {Large Language Models (LLMs), speculative decoding, efficient inference},
  doi      = {10.26599/BDMA.2024.9020074},
  url      = {https://www.sciopen.com/article/10.26599/BDMA.2024.9020074},
  abstract = {Large Language Models (LLMs) have immense potential to enhance the capabilities of Cyber-Physical-Social Intelligence (CPSI) systems, enabling them to better engage with complex cyber, physical, and social environments. However, the high inference latency of LLMs, which is inherited from the autoregressive decoding process, hinders their wide application in CPSI systems. To address this challenge, current approaches have incorporated speculative decoding to enable parallel prediction of multiple subsequent tokens, thereby achieving inference acceleration. Nevertheless, the accuracy of these decoding heads falls short of the autoregressive decoding approach. In light of these limitations, we propose ResDecode, a novel speculative decoding method characterized by its efficient and accurate decoding heads. Within the lightweight draft model, we propose a residual decoding head to compensate for the full context encoder's limited capability on long-range dependencies, thus improving accuracy. ResDecode demonstrates impressive results, achieving a maximum speedup ratio of 3.2$\times$ on the MT-bench compared to vanilla autoregressive decoding.},
}