Mastodon Icon GitHub Icon LinkedIn Icon RSS Icon

How to use ElevenLabs text-to-speech in Unity

A simple script to add ElevenLabs text-to-speech to your Unity game.

Calculon from Futurama sipping cognac or whatever.

As you probably know, I work for a company developing conversational UIs. During the last few months, we had several demos of interactive applications and games focused on user interaction with virtual avatars. Text-to-Speech (TTS) is a fundamental aspect of such applications: nothing destroys the illusion of a virtual character more than a robotic and artificial voice.

For this reason, I ended up trying several text-to-speech services, looking for the perfect one. I had two requirements: 1) I wanted a realistic voice, and 2) I didn’t want to mess with manually tagging and tuning the vocal emotions of a text (in fact, that is hard to do when some of the voiced text is itself generated by AI).

Luckily, we live in a booming age of AI services, so I had many options to choose from (and new services are popping up every day).

At the time of writing (August 2023), I have concluded that the best TTS service is the one provided by ElevenLabs. It is a bit expensive, but the voices are very natural, and it is easy to get high-quality results by just typing the text. The only problem is that ElevenLabs does not offer a Unity SDK, so I had to write a simple script for my Unity projects.

In the spirit of collaboration, I decided to share the basic version of my script. It is not perfect and could use some stronger input sanitization, but it works and is a good starting point if you want to add ElevenLabs TTS to your Unity project.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
using System;
using System.Collections;
using System.Text;
using Newtonsoft.Json;
using UnityEngine;
using UnityEngine.Events;
using UnityEngine.Networking;

/// <summary>
/// Unity component that posts text to the ElevenLabs text-to-speech endpoint
/// and broadcasts the downloaded <see cref="AudioClip"/> through
/// <see cref="AudioReceived"/>.
/// </summary>
public class ElevenlabsAPI : MonoBehaviour {
    [SerializeField]
    private string _voiceId;
    [SerializeField]
    private string _apiKey;
    [SerializeField]
    private string _apiUrl = "https://api.elevenlabs.io";

    // If true, the audio will be streamed instead of downloaded
    // Unfortunately, Unity has some problems with streaming audio
    // but I left this option here in case you want to try it.
    public bool Streaming;

    // Sent as the "optimize_streaming_latency" query parameter.
    [Range(0, 4)]
    public int LatencyOptimization;

    // This event is used to broadcast the received AudioClip
    public UnityEvent<AudioClip> AudioReceived;

    // NOTE(review): Unity normally creates MonoBehaviour instances itself
    // (inspector or AddComponent), so this constructor is presumably not how
    // instances get their credentials in practice - verify before relying on it.
    public ElevenlabsAPI(string apiKey, string voiceId) {
        _apiKey = apiKey;
        _voiceId = voiceId;
    }

    /// <summary>
    /// Starts an asynchronous text-to-speech request for <paramref name="text"/>.
    /// The resulting clip is delivered through <see cref="AudioReceived"/>.
    /// </summary>
    /// <param name="text">The text to be voiced. Empty text is ignored.</param>
    public void GetAudio(string text) {
        if (string.IsNullOrWhiteSpace(text)) {
            Debug.LogWarning("Error downloading audio: no text was provided.");
            return;
        }
        StartCoroutine(DoRequest(text));
    }

    /// <summary>
    /// Coroutine that builds the JSON payload, sends the POST request and,
    /// on success, invokes <see cref="AudioReceived"/> with the decoded clip.
    /// Failures are reported with <see cref="Debug.LogError(object)"/>.
    /// </summary>
    /// <param name="message">The text to be voiced.</param>
    IEnumerator DoRequest(string message) {
        if (string.IsNullOrEmpty(_apiKey) || string.IsNullOrEmpty(_voiceId)) {
            Debug.LogError("Error downloading audio: the API key or the voice ID is not set.");
            yield break;
        }

        // TODO: This could be easily exposed in the Unity inspector,
        // but I had no use for it in my work demo.
        var postData = new TextToSpeechRequest {
            text = message,
            model_id = "eleven_monolingual_v1",
            voice_settings = new VoiceSettings {
                stability = 0,
                similarity_boost = 0,
                style = 0.5f,
                use_speaker_boost = true
            }
        };
        var json = JsonConvert.SerializeObject(postData);

        var stream = Streaming ? "/stream" : "";
        // [Range] only constrains the inspector; clamp values assigned from code too.
        var latency = Mathf.Clamp(LatencyOptimization, 0, 4);
        var url = $"{_apiUrl}/v1/text-to-speech/{_voiceId}{stream}?optimize_streaming_latency={latency}";

        // Build the request by hand instead of UnityWebRequest.Post so that no
        // throw-away handlers are created and then replaced. The using block
        // releases the request (and its handlers) on every exit path,
        // including the early exits on failure.
        using (var request = new UnityWebRequest(url, UnityWebRequest.kHttpVerbPOST)) {
            // UTF-8 keeps accented and other non-ASCII characters intact;
            // ASCII encoding would replace them with '?'.
            request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(json));

            var downloadHandler = new DownloadHandlerAudioClip(url, AudioType.MPEG);
            if (Streaming) {
                downloadHandler.streamAudio = true;
            }
            request.downloadHandler = downloadHandler;

            request.SetRequestHeader("Content-Type", "application/json");
            request.SetRequestHeader("xi-api-key", _apiKey);
            request.SetRequestHeader("Accept", "audio/mpeg");
            yield return request.SendWebRequest();

            if (request.result != UnityWebRequest.Result.Success) {
                Debug.LogError("Error downloading audio: " + request.error);
                yield break;
            }

            AudioClip audioClip = downloadHandler.audioClip;
            if (audioClip == null) {
                Debug.LogError("Error downloading audio: the response could not be decoded into an AudioClip.");
                yield break;
            }

            // The event may be unassigned when nothing initialized it.
            AudioReceived?.Invoke(audioClip);
        }
    }

    /// <summary>JSON body of the text-to-speech request.</summary>
    [Serializable]
    public class TextToSpeechRequest {
        public string text;
        public string model_id; // eleven_monolingual_v1
        public VoiceSettings voice_settings;
    }

    /// <summary>Voice tuning options sent along with the request.</summary>
    // NOTE(review): stability and similarity_boost are ints, so only whole
    // numbers can be sent - confirm against the ElevenLabs API whether
    // fractional values are expected here.
    [Serializable]
    public class VoiceSettings {
        public int stability; // 0
        public int similarity_boost; // 0
        public float style; // 0.5
        public bool use_speaker_boost; // true
    }
}
comments powered by Disqus