% Conference paper (ICASSP 2016). The month/day fields hold 18 May 2016, while
% the note field records the conference dates (20-25 March 2016).
% NOTE(review): address holds a country, not the publisher's city -- confirm
% the intended value before relying on it in styles that print the address.
@inproceedings{84f5a768e5174fdfa758363aeb236c68,
  author    = {Jin, Zeyu and Finkelstein, Adam and Diverdi, Stephen and Lu, Jingwan and Mysore, Gautham J.},
  title     = {{CUTE}: A concatenative method for voice conversion using exemplar-based unit selection},
  booktitle = {2016 {IEEE} International Conference on Acoustics, Speech and Signal Processing, {ICASSP} 2016 - Proceedings},
  series    = {{ICASSP}, {IEEE} International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  pages     = {5660--5664},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2016},
  month     = may,
  day       = {18},
  doi       = {10.1109/ICASSP.2016.7472761},
  language  = {English (US)},
  keywords  = {Voice conversion, concatenative synthesis, exemplar-based, unit selection},
  abstract  = {State-of-the-art voice conversion methods re-synthesize voice from spectral representations such as MFCCs and STRAIGHT, thereby introducing muffled artifacts. We propose a method that circumvents this concern using concatenative synthesis coupled with exemplar-based unit selection. Given parallel speech from source and target speakers as well as a new query from the source, our method stitches together pieces of the target voice. It optimizes for three goals: matching the query, using long consecutive segments, and smooth transitions between the segments. To achieve these goals, we perform unit selection at the frame level and introduce triphone-based preselection that greatly reduces computation and enforces selection of long, contiguous pieces. Our experiments show that the proposed method has better quality than baseline methods, while preserving high individuality.},
  note      = {Publisher Copyright: {\textcopyright} 2016 IEEE.; 41st IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2016 ; Conference date: 20-03-2016 Through 25-03-2016},
}