% Journal article on tokenized video compression (TVC), Visual Intelligence, 2025.
% The acronym in the title is brace-protected so sentence-casing styles keep it
% capitalized; the pages field holds the single value exported by the publisher
% (presumably an article number rather than a page range -- confirm).
@article{Zhou2025,
  author   = {Zhou, Lebin and Ruan, Cihan and Ling, Nam and Chen, Zhenghao and Wang, Wei and Jiang, Wei},
  title    = {{TVC}: tokenized video compression with ultra-low bit rate},
  journal  = {Visual Intelligence},
  year     = {2025},
  volume   = {3},
  pages    = {25},
  doi      = {10.1007/s44267-025-00098-7},
  url      = {https://www.sciopen.com/article/10.1007/s44267-025-00098-7},
  keywords = {Deep learning, Video compression, Tokenization, Dual-stream architecture, Discrete-continuous, Neural codecs},
  abstract = {Tokenized visual representations have shown promise in image compression, yet their extension to video remains underexplored due to the challenges posed by complex temporal dynamics and stringent bit rate constraints. In this paper, we present tokenized video compression (TVC), a token-based dual-stream framework designed to operate effectively at ultra-low bit rates. TVC leverages the Cosmos video tokenizer to extract both discrete and continuous token streams. The discrete tokens are partially masked using a strategic masking scheme and then compressed losslessly with a discrete checkerboard context model to reduce transmission overhead. The masked tokens are reconstructed by a decoder-only Transformer with spatiotemporal token prediction. In parallel, the continuous tokens are quantized and compressed using a continuous checkerboard context model, providing complementary continuous information at ultra-low bit rates. At the decoder side, the two streams are fused with a ControlNet-based multi-scale integration module, ensuring high perceptual quality alongside stable fidelity in reconstruction. Overall, this work illustrates the practicality of tokenized video compression and points to new directions for semantics-aware, token-native approaches.},
}