Year | Title | Venue | Paper | Code | Project Page |
---|---|---|---|---|---|
2024 | MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation | CVPR 2024 (Highlight) | Link | -- | Link |
2024 | LivePhoto: Real Image Animation with Text-guided Motion Control | ECCV 2024 | Link | Link | Link |
2024 | xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed Representations | ECCV 2024 AI4VA | Link | Link | -- |
2024 | MotionBooth: Motion-Aware Customized Text-to-Video Generation | NeurIPS 2024 Spotlight | Link | Link | Link |
2024 | Vivid-ZOO: Multi-View Video Generation with Diffusion Model | NeurIPS 2024 | Link | Link | Link |
2024 | Enhancing Motion in Text-to-Video Generation with Decomposed Encoding and Conditioning | NeurIPS 2024 | Link | Link | Link |
2024 | VideoDirectorGPT: Consistent Multi-scene Video Generation via LLM-Guided Planning | COLM 2024 | Link | Link | Link |
2024 | Still-Moving: Customized Video Generation without Customized Video Data | SIGGRAPH Asia Journal 2024 | Link | -- | Link |
2024 | Text-Animator: Controllable Visual Text Video Generation | AAAI 2025 | Link | Link | Link |
2024 | 3DTrajMaster: Mastering 3D Trajectory for Multi-Entity Motion in Video Generation | ICLR 2025 | Link | Link | Link |
2024 | Pyramidal Flow Matching for Efficient Video Generative Modeling | ICLR 2025 | Link | Link | Link |
2024 | CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer | ICLR 2025 | Link | Link | Hugging Face |
2024 | PhyT2V: LLM-Guided Iterative Self-Refinement for Physics-Grounded Text-to-Video Generation | CVPR 2025 | Link | Link | -- |
2024 | Motion Prompting: Controlling Video Generation with Motion Trajectories | CVPR 2025 | Link | -- | Link |
2024 | ByTheWay: Boost Your Text-to-Video Generation Model to Higher Quality in a Training-free Way | CVPR 2025 | Link | Link | -- |
2024 | Tora: Trajectory-oriented Diffusion Transformer for Video Generation | CVPR 2025 | Link | Link | Link |
2024 | StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation from Text | CVPR 2025 | Link | Link | Link |
2024 | AnimateAnything: Consistent and Controllable Animation for Video Generation | CVPR 2025 | Link | Link | Link |
2024 | FlipSketch: Flipping Static Drawings to Text-Guided Sketch Animations | CVPR 2025 | Link | Link | -- |
2024 | Mind the Time: Temporally-Controlled Multi-Event Video Generation | CVPR 2025 | Link | -- | Link |
Accepted Papers References
%accepted papers
@inproceedings{wang2024microcinema,
title={MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation},
author={Wang, Yanhui and Bao, Jianmin and Weng, Wenming and Feng, Ruoyu and Yin, Dacheng and Yang, Tao and Zhang, Jingxu and Dai, Qi and Zhao, Zhiyuan and Wang, Chunyu and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={8414--8424},
year={2024}
}
@article{chen2023livephoto,
title={LivePhoto: Real Image Animation with Text-guided Motion Control},
author={Chen, Xi and Liu, Zhiheng and Chen, Mengting and Feng, Yutong and Liu, Yu and Shen, Yujun and Zhao, Hengshuang},
journal={arXiv preprint arXiv:2312.02928},
year={2023}
}
@misc{qin2024xgenvideosyn1highfidelitytexttovideosynthesis,
title={xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed Representations},
author={Can Qin and Congying Xia and Krithika Ramakrishnan and Michael Ryoo and Lifu Tu and Yihao Feng and Manli Shu and Honglu Zhou and Anas Awadalla and Jun Wang and Senthil Purushwalkam and Le Xue and Yingbo Zhou and Huan Wang and Silvio Savarese and Juan Carlos Niebles and Zeyuan Chen and Ran Xu and Caiming Xiong},
year={2024},
eprint={2408.12590},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2408.12590},
}
@article{wu2024motionbooth,
title={MotionBooth: Motion-Aware Customized Text-to-Video Generation},
author={Jianzong Wu and Xiangtai Li and Yanhong Zeng and Jiangning Zhang and Qianyu Zhou and Yining Li and Yunhai Tong and Kai Chen},
journal={arXiv preprint arXiv:2406.17758},
year={2024},
}
@misc{li2024vividzoo,
title={Vivid-ZOO: Multi-View Video Generation with Diffusion Model},
author={Bing Li and Cheng Zheng and Wenxuan Zhu and Jinjie Mai and Biao Zhang and Peter Wonka and Bernard Ghanem},
year={2024},
eprint={2406.08659},
archivePrefix={arXiv},
}
@misc{ruan2024enhancingmotiontexttovideogeneration,
title={Enhancing Motion in Text-to-Video Generation with Decomposed Encoding and Conditioning},
author={Penghui Ruan and Pichao Wang and Divya Saxena and Jiannong Cao and Yuhui Shi},
year={2024},
eprint={2410.24219},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2410.24219},
}
@article{Lin2023VideoDirectorGPT,
author = {Han Lin and Abhay Zala and Jaemin Cho and Mohit Bansal},
title = {VideoDirectorGPT: Consistent Multi-Scene Video Generation via LLM-Guided Planning},
year = {2023},
}
@article{chefer2024still,
title={Still-Moving: Customized Video Generation without Customized Video Data},
author={Chefer, Hila and Zada, Shiran and Paiss, Roni and Ephrat, Ariel and Tov, Omer and Rubinstein, Michael and Wolf, Lior and Dekel, Tali and Michaeli, Tomer and Mosseri, Inbar},
journal={ACM Transactions on Graphics (TOG)},
volume={43},
number={6},
pages={1--11},
year={2024},
publisher={ACM New York, NY, USA}
}
@article{liu2024text,
title={Text-Animator: Controllable Visual Text Video Generation},
author={Liu, Lin and Liu, Quande and Qian, Shengju and Zhou, Yuan and Zhou, Wengang and Li, Houqiang and Xie, Lingxi and Tian, Qi},
journal={arXiv preprint arXiv:2406.17777},
year={2024}
}
@article{fu20243dtrajmaster,
title={3DTrajMaster: Mastering 3D Trajectory for Multi-Entity Motion in Video Generation},
author={Fu, Xiao and Liu, Xian and Wang, Xintao and Peng, Sida and Xia, Menghan and Shi, Xiaoyu and Yuan, Ziyang and Wan, Pengfei and Zhang, Di and Lin, Dahua},
journal={arXiv preprint arXiv:2412.07759},
year={2024}
}
@article{jin2024pyramidal,
title={Pyramidal Flow Matching for Efficient Video Generative Modeling},
author={Jin, Yang and Sun, Zhicheng and Li, Ningyuan and Xu, Kun and Xu, Kun and Jiang, Hao and Zhuang, Nan and Huang, Quzhe and Song, Yang and Mu, Yadong and Lin, Zhouchen},
journal={arXiv preprint arXiv:2410.05954},
year={2024}
}
@misc{yang2024cogvideoxtexttovideodiffusionmodels,
title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
author={Zhuoyi Yang and Jiayan Teng and Wendi Zheng and Ming Ding and Shiyu Huang and Jiazheng Xu and Yuanming Yang and Wenyi Hong and Xiaohan Zhang and Guanyu Feng and Da Yin and Xiaotao Gu and Yuxuan Zhang and Weihan Wang and Yean Cheng and Ting Liu and Bin Xu and Yuxiao Dong and Jie Tang},
year={2024},
eprint={2408.06072},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2408.06072},
}
@misc{xue2024phyt2vllmguidediterativeselfrefinement,
title={PhyT2V: LLM-Guided Iterative Self-Refinement for Physics-Grounded Text-to-Video Generation},
author={Qiyao Xue and Xiangyu Yin and Boyuan Yang and Wei Gao},
year={2024},
eprint={2412.00596},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.00596},
}
@article{geng2024motionprompting,
author = {Geng, Daniel and Herrmann, Charles and Hur, Junhwa and Cole, Forrester and Zhang, Serena and Pfaff, Tobias and Lopez-Guevara, Tatiana and Doersch, Carl and Aytar, Yusuf and Rubinstein, Michael and Sun, Chen and Wang, Oliver and Owens, Andrew and Sun, Deqing},
title = {Motion Prompting: Controlling Video Generation with Motion Trajectories},
journal = {arXiv preprint arXiv:2412.02700},
year = {2024},
}
@article{bu2024broadway,
title={ByTheWay: Boost Your Text-to-Video Generation Model to Higher Quality in a Training-free Way},
author={Bu, Jiazi and Ling, Pengyang and Zhang, Pan and Wu, Tong and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Lin, Dahua and Wang, Jiaqi},
journal={arXiv preprint arXiv:2410.06241},
year={2024}
}
@misc{tora,
title={Tora: Trajectory-oriented Diffusion Transformer for Video Generation},
author={Zhenghao Zhang and Junchao Liao and Menghao Li and Long Qin and Weizhi Wang},
year={2024},
eprint={2407.21705},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2407.21705},
}
@article{henschel2024streamingt2v,
title={StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation from Text},
author={Henschel, Roberto and Khachatryan, Levon and Hayrapetyan, Daniil and Poghosyan, Hayk and Tadevosyan, Vahram and Wang, Zhangyang and Navasardyan, Shant and Shi, Humphrey},
journal={arXiv preprint arXiv:2403.14773},
year={2024}
}
@article{lei2024animateanything,
title={AnimateAnything: Consistent and Controllable Animation for Video Generation},
author={Lei, Guojun and Wang, Chi and Li, Hong and Zhang, Rong and Wang, Yikai and Xu, Weiwei},
journal={arXiv preprint arXiv:2411.10836},
year={2024}
}
@article{bandyopadhyay2024flipsketch,
title={FlipSketch: Flipping Static Drawings to Text-Guided Sketch Animations},
author={Bandyopadhyay, Hmrishav and Song, Yi-Zhe},
journal={arXiv preprint arXiv:2411.10818},
year={2024}
}
@article{MinT,
title={Mind the Time: Temporally-Controlled Multi-Event Video Generation},
author={Wu, Ziyi and Siarohin, Aliaksandr and Menapace, Willi and Skorokhodov, Ivan and Fang, Yuwei and Chordia, Varnith and Gilitschenski, Igor and Tulyakov, Sergey},
journal={arXiv preprint arXiv:2412.05263},
year={2024}
}
1. Text-Animator: Controllable Visual Text Video Generation
Lin Liu, Quande Liu, Shengju Qian, Yuan Zhou, Wengang Zhou, Houqiang Li, Lingxi Xie, Qi Tian
(University of Science and Technology of China, Tencent, Nanyang Technological University, Huawei Tech)
Abstract
Video generation is a challenging yet pivotal task in various industries, such as gaming, e-commerce, and advertising. One significant unresolved aspect within T2V is the effective visualization of text within generated videos. Despite the progress achieved in Text-to-Video (T2V) generation, current methods still cannot effectively visualize texts in videos directly, as they mainly focus on summarizing semantic scene information, understanding, and depicting actions. While recent advances in image-level visual text generation show promise, transitioning these techniques into the video domain faces problems, notably in preserving textual fidelity and motion coherence. In this paper, we propose an innovative approach termed Text-Animator for visual text video generation. Text-Animator contains a text embedding injection module to precisely depict the structures of visual text in generated videos. In addition, we develop a camera control module and a text refinement module to improve the stability of generated visual text by controlling the camera movement as well as the motion of visualized text. Quantitative and qualitative experimental results demonstrate the superiority of our approach in the accuracy of generated visual text over state-of-the-art video generation methods.
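The "text embedding injection" idea lends itself to a tiny illustration. The sketch below is not Text-Animator's code: the glyph encoder, the 77-token prompt shape, and the single extra conditioning token are all assumptions, shown only to make "inject an embedding of the visual text alongside the prompt embedding" concrete.

```python
import torch
from torch import nn

# Hypothetical glyph encoder: turns a rasterized image of the target text into one embedding.
glyph_encoder = nn.Sequential(
    nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.SiLU(),
    nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.Flatten(),
    nn.Linear(32 * 16 * 16, 768),
)

glyph_map = torch.rand(1, 1, 64, 64)          # rasterized target text (e.g. "SALE", a date, ...)
prompt_emb = torch.randn(1, 77, 768)          # ordinary text-encoder output (assumed shape)
glyph_emb = glyph_encoder(glyph_map).unsqueeze(1)         # (1, 1, 768)
conditioning = torch.cat([prompt_emb, glyph_emb], dim=1)  # extra token(s) for cross-attention
print(conditioning.shape)                     # torch.Size([1, 78, 768])
```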
2. GameGen-X: Interactive Open-world Game Video Generation
Haoxuan Che, Xuanhua He, Quande Liu, Cheng Jin, Hao Chen
(Hong Kong University of Science and Technology, University of Science and Technology of China, The Chinese University of Hong Kong)
Abstract
We introduce GameGen-X, the first diffusion transformer model specifically designed for both generating and interactively controlling open-world game videos. This model facilitates high-quality, open-domain generation by simulating an extensive array of game engine features, such as innovative characters, dynamic environments, complex actions, and diverse events. Additionally, it provides interactive controllability, predicting and altering future content based on the current clip, thus allowing for gameplay simulation. To realize this vision, we first collected and built an Open-World Video Game Dataset from scratch. It is the first and largest dataset for open-world game video generation and control, which comprises over a million diverse gameplay video clips sampled from over 150 games with informative captions from GPT-4o. GameGen-X undergoes a two-stage training process, consisting of foundation model pre-training and instruction tuning. Firstly, the model was pre-trained via text-to-video generation and video continuation, endowing it with the capability for long-sequence, high-quality open-domain game video generation. Further, to achieve interactive controllability, we designed InstructNet to incorporate game-related multi-modal control signal experts. This allows the model to adjust latent representations based on user inputs, unifying character interaction and scene content control for the first time in video generation. During instruction tuning, only the InstructNet is updated while the pre-trained foundation model is frozen, enabling the integration of interactive controllability without loss of diversity and quality of generated video content.
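The instruction-tuning recipe described above (frozen foundation model, trainable control branch) follows a standard adapter-style pattern. The snippet below is a minimal, hypothetical sketch of that pattern in PyTorch; both modules are toy stand-ins, not GameGen-X's architecture.

```python
import torch
from torch import nn

# Toy stand-ins for the two components named in the abstract.
foundation = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True), num_layers=2)
instruct_net = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))

# Instruction tuning as described: the foundation model stays frozen,
# only the InstructNet-style branch receives gradient updates.
for p in foundation.parameters():
    p.requires_grad_(False)
optimizer = torch.optim.AdamW(instruct_net.parameters(), lr=1e-4)

latents = torch.randn(2, 16, 64)            # toy latent sequence (batch, tokens, dim)
control = torch.randn(2, 16, 64)            # toy multi-modal control signal
with torch.no_grad():
    base = foundation(latents)              # frozen backbone features
adjusted = base + instruct_net(control)     # control branch adjusts latent representations
loss = adjusted.pow(2).mean()               # placeholder objective
loss.backward()
optimizer.step()
```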
3. MVideo: Motion Control for Enhanced Complex Action Video Generation
Qiang Zhou, Shaofeng Zhang, Nianzu Yang, Ye Qian, Hao Li
(INF Tech., Shanghai Jiao Tong University, Fudan University)
Abstract
Existing text-to-video (T2V) models often struggle with generating videos with sufficiently pronounced or complex actions. A key limitation lies in the text prompt's inability to precisely convey intricate motion details. To address this, we propose a novel framework, MVideo, designed to produce long-duration videos with precise, fluid actions. MVideo overcomes the limitations of text prompts by incorporating mask sequences as an additional motion condition input, providing a clearer, more accurate representation of intended actions. Leveraging foundational vision models such as GroundingDINO and SAM2, MVideo automatically generates mask sequences, enhancing both efficiency and robustness. Our results demonstrate that, after training, MVideo effectively aligns text prompts with motion conditions to produce videos that simultaneously meet both criteria. This dual control mechanism allows for more dynamic video generation by enabling alterations to either the text prompt or motion condition independently, or both in tandem. Furthermore, MVideo supports motion condition editing and composition, facilitating the generation of videos with more complex actions. MVideo thus advances T2V motion generation, setting a strong benchmark for improved action depiction in current video diffusion models.
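A rough sketch of what "mask sequences as an additional motion condition" can look like in practice is given below. It is not MVideo's pipeline: detect_and_segment is a hypothetical placeholder for the GroundingDINO + SAM2 step, and the channel-concatenation conditioning is an assumption.

```python
import torch

def detect_and_segment(frames: torch.Tensor, phrase: str) -> torch.Tensor:
    # Placeholder for a GroundingDINO + SAM2 pipeline: returns one binary mask per frame
    # for the phrase being animated. Here we fake it from pixel intensity.
    return (frames.mean(dim=1, keepdim=True) > 0.5).float()   # (T, 1, H, W)

frames = torch.rand(16, 3, 64, 64)                 # reference clip (T, C, H, W)
masks = detect_and_segment(frames, "person jumping")
noisy_latents = torch.randn(16, 4, 64, 64)         # toy diffusion latents
# The mask sequence is supplied as an extra conditioning channel alongside the latents,
# giving the denoiser an explicit, per-frame description of where the motion happens.
conditioned = torch.cat([noisy_latents, masks], dim=1)   # (T, 5, H, W)
print(conditioned.shape)
```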
4. DreamRunner: Fine-Grained Storytelling Video Generation with Retrieval-Augmented Motion Adaptation
Zun Wang, Jialu Li, Han Lin, Jaehong Yoon, Mohit Bansal
(University of North Carolina, Chapel Hill)
Abstract
Storytelling video generation (SVG) has recently emerged as a task to create long, multi-motion, multi-scene videos that consistently represent the story described in the input text script. SVG holds great potential for diverse content creation in media and entertainment; however, it also presents significant challenges: (1) objects must exhibit a range of fine-grained, complex motions, (2) multiple objects need to appear consistently across scenes, and (3) subjects may require multiple motions with seamless transitions within a single scene. To address these challenges, we propose DreamRunner, a novel story-to-video generation method: First, we structure the input script using a large language model (LLM) to facilitate both coarse-grained scene planning as well as fine-grained object-level layout and motion planning. Next, DreamRunner presents retrieval-augmented test-time adaptation to capture target motion priors for objects in each scene, supporting diverse motion customization based on retrieved videos, thus facilitating the generation of new videos with complex, scripted motions. Lastly, we propose a novel spatial-temporal region-based 3D attention and prior injection module SR3AI for fine-grained object-motion binding and frame-by-frame semantic control. We compare DreamRunner with various SVG baselines, demonstrating state-of-the-art performance in character consistency, text alignment, and smooth transitions. Additionally, DreamRunner exhibits strong fine-grained condition-following ability in compositional text-to-video generation, significantly outperforming baselines on T2V-CompBench. Finally, we validate DreamRunner's robust ability to generate multi-object interactions with qualitative examples.
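The region-based attention idea (binding each object phrase to its spatio-temporal layout region) can be illustrated with a small mask-construction sketch. Shapes, the box format, and the static per-frame layout are assumptions; this is not the SR3AI module itself.

```python
import torch

# Each object's text tokens are only allowed to attend to the latent positions inside
# that object's layout box, which binds the phrase to its region.
T, H, W = 4, 8, 8                                      # frames and latent grid
boxes = {"dog": (0, 0, 4, 4), "ball": (4, 4, 8, 8)}    # (x0, y0, x1, y1) per object
text_tokens = {"dog": [0, 1], "ball": [2, 3]}          # token indices per object phrase

n_text, n_video = 4, T * H * W
attn_mask = torch.zeros(n_text, n_video, dtype=torch.bool)   # True = attention allowed
grid = torch.zeros(T, H, W, dtype=torch.bool)
for name, (x0, y0, x1, y1) in boxes.items():
    region = grid.clone()
    region[:, y0:y1, x0:x1] = True                     # same box on every frame, for simplicity
    for tok in text_tokens[name]:
        attn_mask[tok] = region.flatten()
# attn_mask could then gate a cross-attention layer between phrases and video latents.
print(attn_mask.shape, attn_mask.sum().item())
```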
5. Mogo: RQ Hierarchical Causal Transformer for High-Quality 3D Human Motion Generation
Dongjie Fu
(Mogo AI)
Abstract
In the field of text-to-motion generation, BERT-type masked models (MoMask, MMM) currently produce higher-quality outputs compared to GPT-type autoregressive models (T2M-GPT). However, these BERT-type models often lack the streaming output capability required for applications in video game and multimedia environments, a feature inherent to GPT-type models. Additionally, they demonstrate weaker performance in out-of-distribution generation. To surpass the quality of BERT-type models while leveraging a GPT-type structure, without adding extra refinement models that complicate scaling data, we propose a novel architecture, Mogo (Motion Only Generate Once), which generates high-quality lifelike 3D human motions by training a single transformer model. Mogo consists of only two main components: 1) RVQ-VAE, a hierarchical residual vector quantization variational autoencoder, which discretizes continuous motion sequences with high precision; 2) Hierarchical Causal Transformer, responsible for generating the base motion sequences in an autoregressive manner while simultaneously inferring residuals across different layers. Experimental results demonstrate that Mogo can generate continuous and cyclic motion sequences up to 260 frames (13 seconds), surpassing the 196 frames (10 seconds) length limitation of existing datasets like HumanML3D. On the HumanML3D test set, Mogo achieves an FID score of 0.079, outperforming the GPT-type models T2M-GPT (FID = 0.116) and AttT2M (FID = 0.112), as well as the BERT-type model MMM (FID = 0.080). Furthermore, our model achieves the best quantitative performance in out-of-distribution generation.
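The RVQ-VAE component rests on residual vector quantization, which is easy to show in a few lines. The sketch below uses random codebooks and toy dimensions purely for illustration; it is not Mogo's tokenizer.

```python
import torch

def rvq_encode(x, codebooks):
    # Residual vector quantization: each level quantizes what the previous levels
    # could not explain, producing one code index sequence per level.
    codes, residual = [], x
    for cb in codebooks:                        # one codebook per level
        dist = torch.cdist(residual, cb)        # (N, K) distances to code vectors
        idx = dist.argmin(dim=-1)               # nearest code per vector
        codes.append(idx)
        residual = residual - cb[idx]           # pass the leftover to the next level
    return codes

motion = torch.randn(196, 32)                          # toy motion-latent sequence
codebooks = [torch.randn(512, 32) for _ in range(3)]   # 3 residual levels
codes = rvq_encode(motion, codebooks)
# A hierarchical causal transformer would then predict the base codes autoregressively
# and infer the residual levels, as the abstract describes.
print([c.shape for c in codes])
```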
6. STIV: Scalable Text and Image Conditioned Video Generation
Zongyu Lin, Wei Liu, Chen Chen, Jiasen Lu, Wenze Hu, Tsu-Jui Fu, Jesse Allardice, Zhengfeng Lai, Liangchen Song, Bowen Zhang, Cha Chen, Yiran Fei, Yifan Jiang, Lezhi Li, Yizhou Sun, Kai-Wei Chang, Yinfei Yang
(Apple, UCLA)
Abstract
The field of video generation has made remarkable advancements, yet there remains a pressing need for a clear, systematic recipe that can guide the development of robust and scalable models. In this work, we present a comprehensive study that systematically explores the interplay of model architectures, training recipes, and data curation strategies, culminating in a simple and scalable text-image-conditioned video generation method, named STIV. Our framework integrates image condition into a Diffusion Transformer (DiT) through frame replacement, while incorporating text conditioning via a joint image-text conditional classifier-free guidance. This design enables STIV to perform both text-to-video (T2V) and text-image-to-video (TI2V) tasks simultaneously. Additionally, STIV can be easily extended to various applications, such as video prediction, frame interpolation, multi-view generation, and long video generation. With comprehensive ablation studies on T2I, T2V, and TI2V, STIV demonstrates strong performance, despite its simple design. An 8.7B model with 512 resolution achieves 83.1 on VBench T2V, surpassing both leading open and closed-source models like CogVideoX-5B, Pika, Kling, and Gen-3. The same-sized model also achieves a state-of-the-art result of 90.1 on the VBench I2V task at 512 resolution. By providing a transparent and extensible recipe for building cutting-edge video generation models, we aim to empower future research and accelerate progress toward more versatile and reliable video generation solutions.
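The two mechanisms named in the abstract, frame replacement and joint image-text classifier-free guidance, can be sketched compactly. The latent shapes, the guidance scale, and the single-scale joint guidance form below are assumptions, not the paper's exact formulation.

```python
import torch

def frame_replacement(noisy_latents: torch.Tensor, image_latent: torch.Tensor) -> torch.Tensor:
    # Image conditioning by replacing the first noisy latent frame with the clean image latent.
    out = noisy_latents.clone()
    out[:, 0] = image_latent
    return out

def joint_cfg(eps_uncond: torch.Tensor, eps_cond: torch.Tensor, scale: float = 7.5) -> torch.Tensor:
    # One guidance scale applied to the jointly (image + text) conditioned prediction.
    return eps_uncond + scale * (eps_cond - eps_uncond)

latents = torch.randn(1, 16, 4, 32, 32)            # (B, T, C, H, W) toy video latents
image = torch.randn(1, 4, 32, 32)                  # toy clean image latent
latents = frame_replacement(latents, image)
eps_c, eps_u = torch.randn_like(latents), torch.randn_like(latents)
guided = joint_cfg(eps_u, eps_c)
print(guided.shape)
```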
7. Video Storyboarding: Multi-Shot Character Consistency for Text-to-Video Generation
Yuval Atzmon, Rinon Gal, Yoad Tewel, Yoni Kasten, Gal Chechik
(Nvidia)
Abstract
Text-to-video models have made significant strides in generating short video clips from textual descriptions. Yet, a significant challenge remains: generating several video shots of the same characters, preserving their identity without hurting video quality, dynamics, and responsiveness to text prompts. We present Video Storyboarding, a training-free method to enable pretrained text-to-video models to generate multiple shots with consistent characters, by sharing features between them. Our key insight is that self-attention query features (Q) encode both motion and identity. This creates a hard-to-avoid trade-off between preserving character identity and making videos dynamic, when features are shared. To address this issue, we introduce a novel query injection strategy that balances identity preservation and natural motion retention. This approach improves upon naive consistency techniques applied to videos, which often struggle to maintain this delicate equilibrium. Our experiments demonstrate significant improvements in character consistency across scenes while maintaining high-quality motion and text alignment. These results offer insights into critical stages of video generation and the interplay of structure and motion in video diffusion models.
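The query-sharing insight can be illustrated with a toy query-injection function. The blend weight, reference shot, and step schedule mentioned in the comments are assumptions; the sketch only shows the general shape of sharing self-attention queries across shots.

```python
import torch

def inject_queries(q_shots: torch.Tensor, ref_index: int = 0, alpha: float = 0.3) -> torch.Tensor:
    # Blend a reference shot's self-attention queries into every shot's queries,
    # trading off shared character identity against each shot's own motion.
    q_ref = q_shots[ref_index : ref_index + 1]          # (1, tokens, dim)
    return (1 - alpha) * q_shots + alpha * q_ref        # broadcast over shots

q = torch.randn(4, 256, 64)                             # queries for 4 shots
q_shared = inject_queries(q)                            # identity-preserving queries
# q_shared would replace the per-shot queries inside self-attention for a subset of
# denoising steps, leaving later steps free for shot-specific motion.
print(q_shared.shape)
```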
Year | Title | ArXiv Time | Paper | Code | Project Page |
---|---|---|---|---|---|
2024 | Text-Animator: Controllable Visual Text Video Generation | 25 Jun 2024 | Link | Link | Link |
2024 | GameGen-X: Interactive Open-world Game Video Generation | 1 Nov 2024 | Link | Link | Link |
2024 | MVideo: Motion Control for Enhanced Complex Action Video Generation | 13 Nov 2024 | Link | -- | Link |
2024 | DreamRunner: Fine-Grained Storytelling Video Generation with Retrieval-Augmented Motion Adaptation | 25 Nov 2024 | Link | Link | Link |
2024 | Mogo: RQ Hierarchical Causal Transformer for High-Quality 3D Human Motion Generation | 5 Dec 2024 | Link | -- | -- |
2024 | STIV: Scalable Text and Image Conditioned Video Generation | 10 Dec 2024 | Link | -- | -- |
2024 | Video Storyboarding: Multi-Shot Character Consistency for Text-to-Video Generation | 10 Dec 2024 | Link | -- | Link |
ArXiv Papers References
%arxiv papers
@article{liu2024text,
title={Text-Animator: Controllable Visual Text Video Generation},
author={Liu, Lin and Liu, Quande and Qian, Shengju and Zhou, Yuan and Zhou, Wengang and Li, Houqiang and Xie, Lingxi and Tian, Qi},
journal={arXiv preprint arXiv:2406.17777},
year={2024}
}
@misc{che2024gamegenxinteractiveopenworldgame,
title={GameGen-X: Interactive Open-world Game Video Generation},
author={Haoxuan Che and Xuanhua He and Quande Liu and Cheng Jin and Hao Chen},
year={2024},
eprint={2411.00769},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2411.00769},
}
@misc{zhou2024motioncontrolenhancedcomplex,
title={Motion Control for Enhanced Complex Action Video Generation},
author={Qiang Zhou and Shaofeng Zhang and Nianzu Yang and Ye Qian and Hao Li},
year={2024},
eprint={2411.08328},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2411.08328},
}
@article{zun2024dreamrunner,
author = {Zun Wang and Jialu Li and Han Lin and Jaehong Yoon and Mohit Bansal},
title = {DreamRunner: Fine-Grained Storytelling Video Generation with Retrieval-Augmented Motion Adaptation},
journal = {arXiv preprint arXiv:2411.16657},
year = {2024},
url = {https://arxiv.org/abs/2411.16657}
}
@misc{fu2024mogorqhierarchicalcausal,
title={Mogo: RQ Hierarchical Causal Transformer for High-Quality 3D Human Motion Generation},
author={Dongjie Fu},
year={2024},
eprint={2412.07797},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.07797},
}
@misc{lin2024stivscalabletextimage,
title={STIV: Scalable Text and Image Conditioned Video Generation},
author={Zongyu Lin and Wei Liu and Chen Chen and Jiasen Lu and Wenze Hu and Tsu-Jui Fu and Jesse Allardice and Zhengfeng Lai and Liangchen Song and Bowen Zhang and Cha Chen and Yiran Fei and Yifan Jiang and Lezhi Li and Yizhou Sun and Kai-Wei Chang and Yinfei Yang},
year={2024},
eprint={2412.07730},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2412.07730},
}
@article{atzmon2024multi,
title = {Multi-Shot Character Consistency for Text-to-Video Generation},
author = {Atzmon, Yuval and Gal, Rinon and Tewel, Yoad and Kasten, Yoni and Chechik, Gal},
journal={arXiv preprint arXiv:2412.07750},
year = {2024},
}