% Journal article: Yamagishi et al., "A Context Clustering Technique for
% Average Voice Models", IEICE Trans. Inf. and Syst., vol. E86-D, no. 3, 2003.
%
% Notes on this record:
%   - The citation key is kept exactly as exported so existing citations resolve.
%   - The exported author list named Keiichi Tokuda twice (romanised and in
%     native script). Only the romanised form stays in the author field; the
%     native-script form is preserved in the non-standard, style-ignored
%     author-native field, next to the exporter's own yomi (reading) field.
%   - The abstract was exported inside the note field, which styles print; it
%     now lives in abstract. The trailing MIME-type string that was appended to
%     the note is kept in the style-ignored mimetype field.
%   - The issue number uses the standard number field, and month uses the
%     predefined unquoted macro.
@article{oai:nitech.repo.nii.ac.jp:00005124,
  author        = {Yamagishi, Junichi and Tamura, Masatsune and Masuko, Takashi and Tokuda, Keiichi and Kobayashi, Takao},
  title         = {A Context Clustering Technique for Average Voice Models},
  journal       = {{IEICE} Transactions on Information and Systems},
  volume        = {E86-D},
  number        = {3},
  pages         = {534--542},
  year          = {2003},
  month         = mar,
  abstract      = {This paper describes a new context clustering technique for average voice model, which is a set of speaker independent speech synthesis units. In the technique, we first train speaker dependent models using multi-speaker speech database, and then construct a decision tree common to these speaker dependent models for context clustering. When a node of the decision tree is split, only the context related questions which are applicable to all speaker dependent models are adopted. As a result, every node of the decision tree always has training data of all speakers. After construction of the decision tree, all speaker dependent models are clustered using the common decision tree and a speaker independent model, i.e., an average voice model is obtained by combining speaker dependent models. From the results of subjective tests, we show that the average voice models trained using the proposed technique can generate more natural sounding speech than the conventional average voice models.},
  author-native = {徳田, 恵一},
  yomi          = {トクダ, ケイイチ},
  mimetype      = {application/pdf},
}