% Conference paper: "SBAT: Video captioning with sparse boundary-aware
% transformer" (Jin, Huang, Chen, Li, Zhang), IJCAI 2020 proceedings, pp. 630--636.
% Acronyms are brace-protected so sentence-casing styles do not lowercase them.
% NOTE(review): the copyright holder and the "Conference date: 01-01-2021" text
% in the note field look like export artifacts -- verify against the publisher
% record before relying on them.
@inproceedings{0d69ab195d55412b8fb63dec42dbc942,
  author    = {Jin, Tao and Huang, Siyu and Chen, Ming and Li, Yingming and Zhang, Zhongfei},
  title     = {{SBAT}: Video captioning with sparse boundary-aware transformer},
  booktitle = {Proceedings of the 29th International Joint Conference on Artificial Intelligence, {IJCAI} 2020},
  editor    = {Bessiere, Christian},
  series    = {{IJCAI} International Joint Conference on Artificial Intelligence},
  publisher = {International Joint Conferences on Artificial Intelligence},
  year      = {2020},
  pages     = {630--636},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2020 Inst. Sci. inf., Univ. Defence in Belgrade. All rights reserved.; 29th International Joint Conference on Artificial Intelligence, IJCAI 2020 ; Conference date: 01-01-2021},
  abstract  = {In this paper, we focus on the problem of applying the transformer structure to video captioning effectively. The vanilla transformer is proposed for uni-modal language generation task such as machine translation. However, video captioning is a multimodal learning problem, and the video features have much redundancy between different time steps. Based on these concerns, we propose a novel method called sparse boundary-aware transformer (SBAT) to reduce the redundancy in video representation. SBAT employs boundary-aware pooling operation for scores from multihead attention and selects diverse features from different scenarios. Also, SBAT includes a local correlation scheme to compensate for the local information loss brought by sparse operation. Based on SBAT, we further propose an aligned cross-modal encoding scheme to boost the multimodal interaction. Experimental results on two benchmark datasets show that SBAT outperforms the state-of-the-art methods under most of the metrics.},
}