publications | Chenhui Gou

2026

Preprint

Sample-Efficient Learning from Agent Experience

Chenhui Gou, Haoqin Tu, Yunhao Fang, and 2 more authors

Jul 2026

@misc{gou2026sampleefficient,
  author = {Gou, Chenhui and Tu, Haoqin and Fang, Yunhao and Cai, Jianfei and Rezatofighi, Hamid},
  title  = {Sample-Efficient Learning from Agent Experience},
  month  = jul,
  year   = {2026},
  note   = {Preprint},
}

CVPR

VQ-VA World: Towards High-Quality Visual Question-Visual Answering

Chenhui Gou^*, Zilong Chen^*, Zeyu Wang^*, and 10 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2026

Bib

@inproceedings{gou2026vqva,
  author    = {Gou, Chenhui and Chen, Zilong and Wang, Zeyu and Li, Feng and Zhu, Deyao and Duan, Zicheng and Li, Kunchang and Deng, Chaorui and Yuan, Hongyi and Fan, Haoqi and Xie, Cihang and Cai, Jianfei and Rezatofighi, Hamid},
  title     = {{VQ-VA} World: Towards High-Quality Visual Question-Visual Answering},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}

CVPR

An Empirical Study on How Video-LLMs Answer Video Questions

Chenhui Gou, Ziyu Ma, Zicheng Duan, and 6 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2026

Bib

@inproceedings{gou2026empirical,
  author    = {Gou, Chenhui and Ma, Ziyu and Duan, Zicheng and He, Haoyu and Chen, Feng and Liu, Akide and Zhuang, Bohan and Cai, Jianfei and Rezatofighi, Hamid},
  title     = {An Empirical Study on How {Video-LLMs} Answer Video Questions},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2026},
}

AAAI

Where and What Matters: Sensitivity-Aware Task Vectors for Many-Shot Multimodal In-Context Learning

Ziyu Ma^*, Chenhui Gou^*, Yiming Hu, and 4 more authors

In AAAI Conference on Artificial Intelligence (AAAI), 2026

Bib

@inproceedings{ma2026sensitivity,
  author    = {Ma, Ziyu and Gou, Chenhui and Hu, Yiming and Wang, Yong and Chu, Xiangxiang and Zhuang, Bohan and Cai, Jianfei},
  title     = {Where and What Matters: Sensitivity-Aware Task Vectors for Many-Shot Multimodal In-Context Learning},
  booktitle = {AAAI Conference on Artificial Intelligence (AAAI)},
  year      = {2026},
}

ICLR

Sparsity Forcing: Reinforcing Token Sparsity of MLLMs

Feng Chen, Yefei He, Lequan Lin, and 4 more authors

In International Conference on Learning Representations (ICLR), 2026

Bib

@inproceedings{chen2026sparsity,
  author    = {Chen, Feng and He, Yefei and Lin, Lequan and Gou, Chenhui and Liu, Jing and Zhuang, Bohan and Wu, Qi},
  title     = {Sparsity Forcing: Reinforcing Token Sparsity of {MLLMs}},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2026},
}

CVPR

Evaluating and Advancing Multimodal Large Language Models in Ability Lens

Feng Chen^*, Chenhui Gou, Jing Liu, and 6 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Findings, 2026

Bib

@inproceedings{chen2026evaluating,
  author    = {Chen, Feng and Gou, Chenhui and Liu, Jing and Yang, Yang and Li, Zhaoyang and Zhang, Jiyuan and Sun, Zhenbang and Zhuang, Bohan and Wu, Qi},
  title     = {Evaluating and Advancing Multimodal Large Language Models in Ability Lens},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Findings},
  year      = {2026},
}

2025

Tech Report

Seed1.5-VL Technical Report

ByteDance Seed Team

Contributor. 2025

Bib

BAGEL

Emerging Properties in Unified Multimodal Pretraining

ByteDance BAGEL Team

Core contributor. May 2025

Bib Code

Preprint

LightBagel: A Light-Weighted, Double Fusion Framework for Unified Multimodal Understanding and Generation

Zeyu Wang^*, Zilong Chen^*, Chenhui Gou^*, and 8 more authors

2025

Bib

@misc{wang2025lightbagel,
  author = {Wang, Zeyu and Chen, Zilong and Gou, Chenhui and Li, Feng and Deng, Chaorui and Zhu, Deyao and Li, Kunchang and Yu, Weihao and Tu, Haoqin and Xie, Cihang and Fan, Haoqi},
  title  = {{LightBagel}: A Light-Weighted, Double Fusion Framework for Unified Multimodal Understanding and Generation},
  year   = {2025},
  note   = {Preprint},
}

Preprint

UniMedVL: Unifying Medical Multimodal Understanding and Generation Through Observation-Knowledge-Analysis

Contributor. 2025

Bib

CVPR

DrVideo: Document Retrieval Based Long Video Understanding

Ziyu Ma^*, Chenhui Gou^*, Hengcan Shi, and 4 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025

Bib

@inproceedings{ma2025drvideo,
  author    = {Ma, Ziyu and Gou, Chenhui and Shi, Hengcan and Sun, Bin and Li, Shutao and Rezatofighi, Hamid and Cai, Jianfei},
  title     = {{DrVideo}: Document Retrieval Based Long Video Understanding},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}

EMNLP

InfiniBench: A Comprehensive Benchmark for Large Multimodal Models in Very Long Video Understanding

Kirolos Ataallah, Chenhui Gou, Eslam Mohamed Bakr, and 3 more authors

In Conference on Empirical Methods in Natural Language Processing (EMNLP), 2025

Bib

@inproceedings{ataallah2025infinibench,
  author    = {Ataallah, Kirolos and Gou, Chenhui and Bakr, Eslam Mohamed and Pahwa, Khushbu and Ding, Jian and Elhoseiny, Mohamed},
  title     = {{InfiniBench}: A Comprehensive Benchmark for Large Multimodal Models in Very Long Video Understanding},
  booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2025},
}

Preprint

LiveWorld: Simulating Out-of-Sight Dynamics in Generative Video World Models

Zicheng Duan, Jiatong Xia, Zeyu Zhang, and 7 more authors

2025

Bib

@misc{duan2025liveworld,
  author = {Duan, Zicheng and Xia, Jiatong and Zhang, Zeyu and Zhang, Wenbo and Zhou, Gengze and Gou, Chenhui and He, Yefei and Chen, Feng and Zhang, Xinyu and Liu, Lingqiao},
  title  = {{LiveWorld}: Simulating Out-of-Sight Dynamics in Generative Video World Models},
  year   = {2025},
  note   = {Preprint},
}

CVPR

Point-Cache: Test-time Dynamic and Hierarchical Cache for Robust and Generalizable Point Cloud Analysis

Hongyu Sun, Qiuhong Ke, Ming Cheng, and 4 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025

Bib

@inproceedings{sun2025pointcache,
  author    = {Sun, Hongyu and Ke, Qiuhong and Cheng, Ming and Wang, Yongcai and Li, Deying and Gou, Chenhui and Cai, Jianfei},
  title     = {{Point-Cache}: Test-time Dynamic and Hierarchical Cache for Robust and Generalizable Point Cloud Analysis},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2025},
}

2024

Preprint

Mobile-VideoGPT: Fast and Accurate Video Understanding Language Model

Abdelrahman Shaker, Muhammad Maaz, Chenhui Gou, and 3 more authors

2024

Bib

@misc{shaker2024mobilevideogpt,
  author = {Shaker, Abdelrahman and Maaz, Muhammad and Gou, Chenhui and Rezatofighi, Hamid and Khan, Salman and Khan, Fahad Shahbaz},
  title  = {{Mobile-VideoGPT}: Fast and Accurate Video Understanding Language Model},
  year   = {2024},
  note   = {Preprint},
}

Preprint

How Well Can Vision Language Models See Image Details?

Chenhui Gou, Faizan Khan, Deyao Zhu, and 4 more authors

2024

Bib

@misc{gou2024howwell,
  author = {Gou, Chenhui and Khan, Faizan and Zhu, Deyao and Felemban, Abdulwahab and Cai, Jianfei and Rezatofighi, Hamid and Elhoseiny, Mohamed},
  title  = {How Well Can Vision Language Models See Image Details?},
  year   = {2024},
  note   = {Preprint},
}

CVPR

JRDB-PanoTrack: An Open-World Panoptic Segmentation and Tracking Robotic Dataset in Crowded Human Environments

Duy-Tho Le^*, Chenhui Gou^*, Stavya Datta, and 4 more authors

In IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024

Bib

@inproceedings{le2024jrdb,
  author    = {Le, Duy-Tho and Gou, Chenhui and Datta, Stavya and Shi, Hengcan and Reid, Ian and Cai, Jianfei and Rezatofighi, Hamid},
  title     = {{JRDB-PanoTrack}: An Open-World Panoptic Segmentation and Tracking Robotic Dataset in Crowded Human Environments},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024},
}

2023

Preprint

Strong and Controllable Blind Image Decomposition

Zeyu Zhang^*, Junlin Han^*, Chenhui Gou^*, and 2 more authors

2023

Bib

@misc{zhang2023blind,
  author = {Zhang, Zeyu and Han, Junlin and Gou, Chenhui and Li, Hongdong and Zheng, Liang},
  title  = {Strong and Controllable Blind Image Decomposition},
  year   = {2023},
  note   = {Preprint},
}

2022

NeurIPS

RTFormer: Efficient Design for Real-Time Semantic Segmentation with Transformer

Jian Wang^*, Chenhui Gou^*, Qiman Wu^*, and 4 more authors

In Advances in Neural Information Processing Systems (NeurIPS), 2022. Spotlight Presentation.

Bib

@inproceedings{wang2022rtformer,
  author    = {Wang, Jian and Gou, Chenhui and Wu, Qiman and Feng, Haocheng and Han, Junyu and Ding, Errui and Wang, Jingdong},
  title     = {{RTFormer}: Efficient Design for Real-Time Semantic Segmentation with Transformer},
  booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
  year      = {2022},
}