% Conference paper record: SCOT, published in the WACV 2025 proceedings (see booktitle, note).
% Author names use the unambiguous "Last, First" form; the acronym in the
% title is brace-protected so sentence-casing styles keep it uppercase.
% NOTE(review): `address` holds a country rather than a publisher city -- confirm
% the intended value before relying on it in styles that print the address.
@inproceedings{6ff1806190564f7f9a034ec53a7d74f8,
  author    = {Jawade, Bhavin and Soares, Jo{\~a}o V. B. and Thadani, Kapil and Mohan, Deen Dayal and Eshratifar, Amir Erfan and Culpepper, Benjamin and De Juan, Paloma and Setlur, Srirangaraj and Govindaraju, Venu},
  title     = {{SCOT}: Self-Supervised Contrastive Pretraining for Zero-Shot Compositional Retrieval},
  booktitle = {Proceedings - 2025 IEEE Winter Conference on Applications of Computer Vision, WACV 2025},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {5509--5519},
  year      = {2025},
  doi       = {10.1109/WACV61041.2025.00538},
  language  = {English},
  keywords  = {compositional, computer-vision, language, llm, retrieval, self-supervised, vision, zero-shot},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE.; 2025 IEEE/CVF Winter Conference on Applications of Computer Vision, WACV 2025 ; Conference date: 28-02-2025 Through 04-03-2025},
  abstract  = {Compositional image retrieval (CIR) is a multimodal learning task where a model combines a query image with a user-provided text modification to retrieve a target image. CIR finds applications in a variety of domains including product retrieval (e-commerce) and web search. Existing methods primarily focus on fully-supervised learning, wherein models are trained on datasets of labeled triplets such as FashionIQ and CIRR. This poses two significant challenges: (i) curating such triplet datasets is labor intensive; and (ii) models lack generalization to unseen objects and domains. In this work, we propose SCOT (Self-supervised COmpositional Training), a novel zero-shot compositional pretraining strategy that combines existing large image-text pair datasets with the generative capabilities of large language models to contrastively train an embedding composition network. Specifically, we show that the text embedding from a large-scale contrastively-pretrained vision-language model can be utilized as proxy target supervision during compositional pretraining, replacing the target image embedding. In zero-shot settings, this strategy surpasses SOTA zero-shot compositional retrieval methods as well as many fully-supervised methods on standard benchmarks such as FashionIQ and CIRR. Our code and models are available at https://github.com/yahoo/SCOT.},
}