@inproceedings{9e7dde91e18a4491a6ad4ea0cb11e5c2,
title = "GR0: SELF-SUPERVISED GLOBAL REPRESENTATION LEARNING FOR ZERO-SHOT VOICE CONVERSION",
abstract = "Research in generative self-supervised learning (SSL) has largely focused on local embeddings for tokenized sequences. We introduce a generative SSL framework that learns a global representation that is disentangled from local embeddings. We apply this technique to jointly learn a global speaker embedding and a zero-shot voice converter. The converter modifies recorded speech to sound as if it were spoken by a different person while preserving the content, using only a short reference clip unavailable to the model during training. Listening experiments conducted on an unseen dataset show that our models significantly outperform SOTA baselines in both quality and speaker similarity for various datasets and unseen languages.",
keywords = "cross-lingual zero-shot voice conversion, generative self-supervised global representation learning",
author = "Yunyun Wang and Jiaqi Su and Adam Finkelstein and Zeyu Jin",
note = "Publisher Copyright: {\textcopyright} 2024 IEEE.; 49th IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 ; Conference date: 14-04-2024 Through 19-04-2024",
year = "2024",
doi = "10.1109/ICASSP48485.2024.10448232",
language = "English (US)",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "10786--10790",
booktitle = "2024 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2024 - Proceedings",
address = "United States",
}