Text to speech API provides a speech endpoint based on our TTS (text-to-speech) model.

53254f36 · v · f2f1f5bc · 53254f36 · 53254f36
Commit 53254f36 authored Feb 10, 2024 by v
Hide whitespace changes
Inline Side-by-side

Showing with 109 additions and 14 deletions

src/Audio.cpp src/Audio.cpp +108 -14

src/Audio.h src/Audio.h +1 -0

No files found.
--- a/src/Audio.cpp
+++ b/src/Audio.cpp
@@ -369,6 +369,103 @@ void Audio::setConnectionTimeout(uint16_t timeout_ms, uint16_t timeout_ms_ssl) {
    if(timeout_ms_ssl) m_timeout_ms_ssl = timeout_ms_ssl;
 }

+/* 
+    Text to speech API provides a speech endpoint based on our TTS (text-to-speech) model.
+    More info: https://platform.openai.com/docs/guides/text-to-speech/text-to-speech
+
+    Request body:
+    model (string) [Required] - One of the available TTS models: tts-1 or tts-1-hd
+    input (string) [Required] - The text to generate audio for. The maximum length is 4096 characters.
+    voice (string) [Required] - The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
+    response_format (string) [Optional] - Defaults to mp3. The format to audio in. Supported formats are mp3, opus, aac, and flac.
+    speed (number) [Optional] - Defaults to 1. The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
+
+    Usage: audio.openai_speech(OPENAI_API_KEY, "tts-1", input, "shimmer", "mp3", "1");
+*/
+bool Audio::openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed) {
+    char host[] = "api.openai.com";
+    char path[] = "/v1/audio/speech";
+
+    xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);
+
+    if (input == "") {
+        AUDIO_INFO("input text is empty");
+        stopSong();
+        xSemaphoreGiveRecursive(mutex_audio);
+        return false;
+    }
+
+    AUDIO_INFO("Connect to new host: \"%s\"", host);
+    setDefaults();
+    m_f_ssl = true;
+
+    String input_clean = "";
+    for (int i = 0; i < input.length(); i++) {
+        char c = input.charAt(i);
+        if (c == '\"') {
+            input_clean += "\\\"";
+        } else if (c == '\n') {
+            input_clean += "\\n";
+        } else {
+            input_clean += c;
+        }
+    }
+
+    String post_body = "{"
+        "\"model\": \"" + model + "\"," +
+        "\"input\": \"" + input_clean + "\"," +
+        "\"voice\": \"" + voice + "\"," +
+        "\"response_format\": \"" + response_format + "\"," +
+        "\"speed\": \"" + speed + "\"" +
+    "}";
+
+    String http_request = 
+        "POST " + String(path) + " HTTP/1.0\r\n" // UNKNOWN ERROR CODE (0050) - crashing on HTTP/1.1 need to use HTTP/1.0
+        + "Host: " + String(host) + "\r\n"
+        + "Authorization: Bearer " + api_key + "\r\n"
+        + "Accept-Encoding: identity;q=1,*;q=0\r\n"
+        + "User-Agent: nArija/1.0\r\n" 
+        + "Content-Type: application/json; charset=utf-8\r\n" 
+        + "Content-Length: " + post_body.length() + "\r\n" 
+        + "Connection: keep-alive\r\n" + "\r\n" 
+        + post_body + "\r\n"
+    ;
+
+    bool res = true;
+    int port = 443;
+    _client = static_cast<WiFiClient*>(&clientsecure);
+
+    uint32_t t = millis();
+    if (m_f_Log) AUDIO_INFO("connect to %s on port %d path %s", host, port, path);
+    res = _client->connect(host, port, m_timeout_ms_ssl);
+    if (res) {
+        uint32_t dt = millis() - t;
+        strcpy(m_lastHost, host);
+        AUDIO_INFO("%s has been established in %lu ms, free Heap: %lu bytes", "SSL", (long unsigned int) dt, (long unsigned int) ESP.getFreeHeap());
+        m_f_running = true;
+    }
+
+    m_expectedCodec = CODEC_NONE;
+    m_expectedPlsFmt = FORMAT_NONE;
+
+    if (res) {
+        log_i("connecttohost(): %s", http_request);
+        _client->print(http_request);
+        if (response_format == "mp3") m_expectedCodec  = CODEC_MP3;
+        if (response_format == "opus") m_expectedCodec  = CODEC_OPUS;
+        if (response_format == "aac") m_expectedCodec  = CODEC_AAC;
+        if (response_format == "flac") m_expectedCodec  = CODEC_FLAC;
+        setDatamode(HTTP_RESPONSE_HEADER);
+        m_streamType = ST_WEBSTREAM;
+    } else {
+        AUDIO_INFO("Request %s failed!", host);
+        m_lastHost[0] = 0;
+    }
+
+    xSemaphoreGiveRecursive(mutex_audio);
+    return res;
+}
+
 //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
    // user and pwd for authentification only, can be empty
@@ -506,6 +603,7 @@ bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
    m_expectedPlsFmt = FORMAT_NONE;

    if(res) {
+        log_i("connecttohost(): %s", rqh);
        _client->print(rqh);
        if(endsWith(extension, ".mp3" )) m_expectedCodec  = CODEC_MP3;
        if(endsWith(extension, ".aac" )) m_expectedCodec  = CODEC_AAC;
@@ -3525,7 +3623,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            continue;
        }

-        // log_i("httpResponseHeader: %s", rhl);
+        //log_i("httpResponseHeader: %s", rhl);

        int16_t posColon = indexOf(rhl, ":", 0); // lowercase all letters up to the colon
        if(posColon >= 0) {
@@ -3599,16 +3697,6 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            AUDIO_INFO("Filename is %s", rhl + pos1);
        }

-        // if(startsWith(rhl, "set-cookie:")         ||
-        //         startsWith(rhl, "pragma:")        ||
-        //         startsWith(rhl, "expires:")       ||
-        //         startsWith(rhl, "cache-control:") ||
-        //         startsWith(rhl, "icy-pub:")       ||
-        //         startsWith(rhl, "p3p:")           ||
-        //         startsWith(rhl, "accept-ranges:") ){
-        //     ; // do nothing
-        // }
-
        else if(startsWith(rhl, "connection:")) {
            if(indexOf(rhl, "close", 0) >= 0) { ; /* do nothing */ }
        }
@@ -3670,7 +3758,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            if(audio_icydescription) audio_icydescription(c_idesc);
        }

-        else if((startsWith(rhl, "transfer-encoding:"))) {
+        else if(startsWith(rhl, "transfer-encoding:")) {
            if(endsWith(rhl, "chunked") || endsWith(rhl, "Chunked")) { // Station provides chunked transfer
                m_f_chunked = true;
                if(m_f_Log) AUDIO_INFO("chunked data transfer");
@@ -5551,8 +5639,14 @@ boolean Audio::streamDetection(uint32_t bytesAvail) {
        tmr_lost = millis() + 1000;
        if(cnt_lost == 5) { // 5s no data?
            cnt_lost = 0;
-            AUDIO_INFO("Stream lost -> try new connection");
-            connecttohost(m_lastHost);
+            if (String(m_lastHost) == "api.openai.com") {
+                AUDIO_INFO("End of Stream.");
+                m_f_running = false;
+                setDatamode(AUDIO_NONE);
+            } else {
+                AUDIO_INFO("Stream lost -> try new connection");
+                connecttohost(m_lastHost);
+            }
            return true;
        }
    }

--- a/src/Audio.h
+++ b/src/Audio.h
@@ -127,6 +127,7 @@ public:
    Audio(bool internalDAC = false, uint8_t channelEnabled = 3, uint8_t i2sPort = I2S_NUM_0); // #99
    ~Audio();
    void setBufsize(int rambuf_sz, int psrambuf_sz);
+    bool openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed);
    bool connecttohost(const char* host, const char* user = "", const char* pwd = "");
    bool connecttospeech(const char* speech, const char* lang);
    bool connecttoFS(fs::FS &fs, const char* path, int32_t resumeFilePos = -1);