@inproceedings{95a82ad962554335a863deba0e1efc4c,
title = "Multi-query Video Retrieval",
abstract = "Retrieving target videos based on text descriptions is a task of great practical value and has received increasing attention over the past few years. Despite recent progress, imperfect annotations in existing video retrieval datasets have posed significant challenges to model evaluation and development. In this paper, we tackle this issue by focusing on the less-studied setting of multi-query video retrieval, where multiple descriptions are provided to the model for searching over the video archive. We first show that the multi-query retrieval task effectively mitigates the dataset noise introduced by imperfect annotations and correlates better with human judgement when evaluating the retrieval abilities of current models. We then investigate several methods that leverage multiple queries at training time, and demonstrate that multi-query-inspired training can lead to superior performance and better generalization. We hope further investigation in this direction can bring new insights into building systems that perform better in real-world video retrieval applications (code is available at https://github.com/princetonvisualai/MQVR).",
keywords = "Evaluation, Multi-query, Video retrieval",
author = "Zeyu Wang and Yu Wu and Karthik Narasimhan and Olga Russakovsky",
note = "Funding Information: This material is based upon work supported by the National Science Foundation under Grant No. 2107048. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation. We would like to thank members of the Princeton Visual AI Lab (Jihoon Chung, Zhiwei Deng, William Yang and others) for their helpful comments and suggestions. Publisher Copyright: {\textcopyright} 2022, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 17th European Conference on Computer Vision, ECCV 2022; Conference date: 23-10-2022 through 27-10-2022",
year = "2022",
doi = "10.1007/978-3-031-19781-9_14",
language = "English (US)",
isbn = "9783031197802",
series = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
publisher = "Springer Science and Business Media Deutschland GmbH",
pages = "233--249",
editor = "Shai Avidan and Gabriel Brostow and Moustapha Ciss{\'e} and {Giovanni Maria} Farinella and Tal Hassner",
booktitle = "Computer Vision – ECCV 2022: 17th European Conference, Proceedings",
address = "Germany",
}