Merge pull request #676 from valentasn/openai_speech

Added OpenAI Text to speech API support

Merge pull request #676 from valentasn/openai_speech
Added OpenAI Text to speech API support
f72086a1 · Wolle · GitHub · 60ee9113 · 51a1dc8a · f72086a1
Unverified Commit f72086a1 authored Feb 13, 2024 by Wolle Committed by GitHub Feb 13, 2024
4 changed files
--- a/examples/OpenAI_Speech/readme.md
+++ b/examples/OpenAI_Speech/readme.md
+# platformio.ini - example for: https://www.seeedstudio.com/XIAO-ESP32S3-p-5627.html
+
+[env:seeed_xiao_esp32s3]
+platform = espressif32
+board = seeed_xiao_esp32s3
+framework = arduino
+monitor_speed = 115200
+build_flags = 
+	-Wall
+	-Wextra
+	-DCORE_DEBUG_LEVEL=3
+	-DBOARD_HAS_PSRAM
+    -DAUDIO_LOG
+	-DARDUINO_RUNNING_CORE=1       ; Arduino Runs On Core (setup, loop)
+	-DARDUINO_EVENT_RUNNING_CORE=1 ; Events Run On Core
+lib_deps = 
+	https://github.com/schreibfaul1/ESP32-audioI2S.git#f2f1f5bcce74523dfc59e5844ba5878ed69c040a
+
+
+# main.cpp - using xTask example:
+
+```cpp
+#include <Arduino.h>
+#include "SPI.h"
+#include <WiFi.h>
+#include <WiFiMulti.h>
+#include "Audio.h"
+
+// WiFi credentials
+#define WIFI_SSID "<YOUR_WIFI_SSID>"
+#define PASSWORD "<YOUR_WIFI_PASSWORD>"
+#define OPENAI_API_KEY "<YOUR_OPENAI_API_KEY>"
+
+// Configure I2S pins
+#define I2S_LRC D1
+#define I2S_DOUT D2
+#define I2S_BCLK D3
+#define I2S_MCLK 0
+
+// Vars
+bool isWIFIConnected;
+
+String result = "Added OpenAI Text to speech API support";
+
+// Inits
+WiFiMulti wifiMulti;
+TaskHandle_t playaudio_handle;
+QueueHandle_t audioQueue;
+Audio audio;
+
+// Declaration
+void audio_info(const char *info);
+void wifiConnect(void *pvParameters);
+void playaudio(void *pvParameters);
+
+// Default
+// One-time initialisation: opens the serial port, creates the single-slot queue used to
+// wake the playback task, and starts the WiFi and playback tasks.
+void setup() {
+    Serial.begin(115200);
+    isWIFIConnected = false;
+
+    // Create queue
+    audioQueue = xQueueCreate(1, sizeof(int));
+    if (audioQueue == NULL) {
+        Serial.println("Failed to create audioQueue");
+        while (1) delay(1000); // nothing can work without the queue; halt here instead of busy-spinning
+    }
+
+    // Create tasks; xTaskCreate returns pdPASS only when the task could be allocated
+    if (xTaskCreate(wifiConnect, "wifi_Connect", 4096, NULL, 0, NULL) != pdPASS) {
+        Serial.println("Failed to create wifi_Connect task");
+    }
+    delay(500);
+    if (xTaskCreate(playaudio, "playaudio", 1024 * 8, NULL, 3, &playaudio_handle) != pdPASS) {
+        Serial.println("Failed to create playaudio task");
+    }
+}
+
+// Arduino main loop: calls audio.loop() on every iteration.
+void loop() {
+    audio.loop();
+}
+
+// Writes an info message to the serial console, prefixed with "audio_info: ".
+void audio_info(const char *info) {
+    static const char prefix[] = "audio_info: ";
+    Serial.print(prefix);
+    Serial.println(info);
+}
+
+// FreeRTOS task: connects to WiFi, then posts one message to audioQueue to wake the
+// playback task. Never returns; once connected it sleeps in one-second steps.
+void wifiConnect(void *pvParameters) {
+    (void)pvParameters; // unused; avoids an unused-parameter warning under -Wextra
+    while (1) {
+        if (!isWIFIConnected) {
+            wifiMulti.addAP(WIFI_SSID, PASSWORD);
+            Serial.println("Connecting to WiFi...");
+            while (wifiMulti.run() != WL_CONNECTED) {
+                vTaskDelay(500 / portTICK_PERIOD_MS); // vTaskDelay takes ticks, not milliseconds
+            }
+            Serial.print("Connected to WiFi\nIP: ");
+            Serial.println(WiFi.localIP());
+            isWIFIConnected = true;
+
+            Serial.println("Sending result...");
+            const int eventMessage = 1; // only a wake-up signal, but the queue copies it, so initialise it
+            if (xQueueSend(audioQueue, &eventMessage, 0) != pdPASS) {
+                Serial.println("Failed to send result to queue");
+            }
+        } else {
+            vTaskDelay(1000 / portTICK_PERIOD_MS);
+        }
+    }
+}
+
+// FreeRTOS task: once WiFi is up, blocks on audioQueue and, for every message received,
+// configures the I2S output and requests speech for the global `result` text. Never returns.
+void playaudio(void *pvParameters) {
+    (void)pvParameters; // unused; avoids an unused-parameter warning under -Wextra
+    while (1) {
+        if (isWIFIConnected && audioQueue != NULL) {
+            int eventMessage;
+            Serial.println("Waiting for result...");
+            if (xQueueReceive(audioQueue, &eventMessage, portMAX_DELAY) == pdPASS) {
+                Serial.print("Received result: ");
+                Serial.println(result);
+
+                // Speech
+                audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT, -1);
+                audio.setVolume(15); // 0...21
+                if (!audio.openai_speech(OPENAI_API_KEY, "tts-1", result, "shimmer", "mp3", "1")) {
+                    Serial.println("OpenAI speech request failed");
+                }
+            }
+        } else {
+            vTaskDelay(1000 / portTICK_PERIOD_MS);
+        }
+    }
+}
+```
+---
+
+# console output example:
+
+--- Terminal on /dev/ttyACM0 | 115200 8-N-1
+--- Available filters and text transformations: colorize, debug, default, direct, esp32_exception_decoder, hexlify, log2file, nocontrol, printable, send_on_enter, time
+--- More details at https://bit.ly/pio-monitor-filters
+--- Quit: Ctrl+C | Menu: Ctrl+T | Help: Ctrl+T followed by Ctrl+H
+[  3911][I][WiFiMulti.cpp:114] run(): [WIFI] scan done
+[  3911][I][WiFiMulti.cpp:119] run(): [WIFI] 15 networks found
+[  3911][I][WiFiMulti.cpp:160] run(): [WIFI] Connecting BSSID: 26:AD:69:C2:AB:E8 SSID: OpwnSS Channel: 11 (-38)
+[  4000][I][WiFiMulti.cpp:174] run(): [WIFI] Connecting done.
+Connected to WiFi
+IP: 192.168.86.23
+Sending result...
+Waiting for result...
+Received result: Added OpenAI Text to speech API support
+audio_info: Connect to new host: "api.openai.com"
+audio_info: PSRAM found, inputBufferSize: 638965 bytes
+[  4781][I][Audio.cpp:5248] ts_parsePacket(): parseTS reset
+audio_info: buffers freed, free Heap: 241976 bytes
+audio_info: connect to api.openai.com on port 443 path /v1/audio/speech
+audio_info: SSL has been established in 1108 ms, free Heap: 200804 bytes
+Waiting for result...
+[  6707][I][Audio.cpp:3949] parseContentType(): ContentType audio/mpeg, format is mp3
+audio_info: MP3Decoder has been initialized, free Heap: 201136 bytes , free stack 5648 DWORDs
+[  6711][I][Audio.cpp:3795] parseHttpResponseHeader(): Switch to DATA, metaint is 0
+audio_info: stream ready
+audio_info: syncword found at pos 0
+audio_info: Channels: 1
+audio_info: SampleRate: 24000
+audio_info: BitsPerSample: 16
+audio_info: BitRate: 160000
+audio_info: slow stream, dropouts are possible
+audio_info: slow stream, dropouts are possible
+audio_info: End of Stream.
\ No newline at end of file
--- a/examples/OpenAI_Speech/src/main.cpp
+++ b/examples/OpenAI_Speech/src/main.cpp
+#include <Arduino.h>
+#include "SPI.h"
+#include <WiFi.h>
+#include <WiFiMulti.h>
+#include "Audio.h"
+
+// WiFi credentials
+#define WIFI_SSID "<YOUR_WIFI_SSID>"
+#define PASSWORD "<YOUR_WIFI_PASSWORD>"
+#define OPENAI_API_KEY "<YOUR_OPENAI_API_KEY>" // https://platform.openai.com/api-keys
+
+// Configure I2S pins
+#define I2S_LRC D1
+#define I2S_DOUT D2
+#define I2S_BCLK D3
+#define I2S_MCLK 0
+
+// Inits
+WiFiMulti wifiMulti;
+Audio audio;
+
+// Declaration
+void audio_info(const char *info);
+
+// Default
+// One-time initialisation: opens the serial port, blocks until WiFi is connected,
+// configures the I2S output and sends a single text-to-speech request.
+void setup() {
+    Serial.begin(115200);
+
+    // Wifi
+    wifiMulti.addAP(WIFI_SSID, PASSWORD);
+    Serial.println("Connecting to WiFi...");
+    while (wifiMulti.run() != WL_CONNECTED) {
+        delay(500);
+    }
+    Serial.print("Connected to WiFi\nIP: ");
+    Serial.println(WiFi.localIP());
+
+    delay(500);
+
+    // Speech
+    audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT, -1);
+    audio.setVolume(15); // 0...21
+    // openai_speech() returns false when the text is empty or the connection fails
+    if (!audio.openai_speech(OPENAI_API_KEY, "tts-1", "Added OpenAI Text to speech API support", "shimmer", "mp3", "1")) {
+        Serial.println("OpenAI speech request failed");
+    }
+}
+
+// Arduino main loop: calls audio.loop() on every iteration.
+void loop() {
+    audio.loop();
+}
+
+// Writes an info message to the serial console, prefixed with "audio_info: ".
+void audio_info(const char *info) {
+    static const char prefix[] = "audio_info: ";
+    Serial.print(prefix);
+    Serial.println(info);
+}
\ No newline at end of file
--- a/src/Audio.cpp
+++ b/src/Audio.cpp
@@ -371,6 +371,102 @@ void Audio::setConnectionTimeout(uint16_t timeout_ms, uint16_t timeout_ms_ssl) {
    if(timeout_ms_ssl) m_timeout_ms_ssl = timeout_ms_ssl;
 }

+/*
+    Text to speech API provides a speech endpoint based on OpenAI's TTS (text-to-speech) model.
+    More info: https://platform.openai.com/docs/guides/text-to-speech/text-to-speech
+
+    Request body:
+    model (string) [Required] - One of the available TTS models: tts-1 or tts-1-hd
+    input (string) [Required] - The text to generate audio for. The maximum length is 4096 characters.
+    voice (string) [Required] - The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
+    response_format (string) [Optional] - Defaults to mp3. The format of the returned audio. Supported formats are mp3, opus, aac, and flac.
+    speed (number) [Optional] - Defaults to 1. The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
+                                Passed in as text; a value that does not parse into 0.25..4.0 is replaced by 1.0.
+
+    Returns true once the TLS connection is established and the request has been sent,
+    false if the input text is empty or the connection fails.
+
+    Usage: audio.openai_speech(OPENAI_API_KEY, "tts-1", input, "shimmer", "mp3", "1");
+*/
+bool Audio::openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed) {
+    char host[] = "api.openai.com";
+    char path[] = "/v1/audio/speech";
+
+    xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);
+
+    if (input == "") {
+        AUDIO_INFO("input text is empty");
+        stopSong();
+        xSemaphoreGiveRecursive(mutex_audio);
+        return false;
+    }
+
+    AUDIO_INFO("Connect to new host: \"%s\"", host);
+    setDefaults();
+    m_f_ssl = true;
+
+    // Escape the text for use inside a JSON string literal: quotes, backslashes and control characters.
+    static const char hex_digits[] = "0123456789abcdef";
+    String input_clean = "";
+    for (unsigned int i = 0; i < input.length(); i++) {
+        const char c = input.charAt(i);
+        switch (c) {
+            case '\"': input_clean += "\\\""; break;
+            case '\\': input_clean += "\\\\"; break;
+            case '\n': input_clean += "\\n"; break;
+            case '\r': input_clean += "\\r"; break;
+            case '\t': input_clean += "\\t"; break;
+            default:
+                if (static_cast<unsigned char>(c) < 0x20) { // remaining control characters must be written as \u00XX
+                    input_clean += "\\u00";
+                    input_clean += hex_digits[(c >> 4) & 0x0F];
+                    input_clean += hex_digits[c & 0x0F];
+                } else {
+                    input_clean += c; // printable ASCII and UTF-8 bytes pass through unchanged
+                }
+        }
+    }
+
+    // The API documents "speed" as a JSON number; the negated range test also rejects NaN.
+    float speed_value = speed.toFloat();
+    if (!(speed_value >= 0.25f && speed_value <= 4.0f)) speed_value = 1.0f;
+
+    String post_body = "{"
+        "\"model\": \"" + model + "\"," +
+        "\"input\": \"" + input_clean + "\"," +
+        "\"voice\": \"" + voice + "\"," +
+        "\"response_format\": \"" + response_format + "\"," +
+        "\"speed\": " + String(speed_value, 2) +
+    "}";
+
+    // Nothing is appended after post_body: Content-Length must match the bytes actually sent.
+    String http_request =
+        "POST " + String(path) + " HTTP/1.0\r\n" // UNKNOWN ERROR CODE (0050) - crashing on HTTP/1.1 need to use HTTP/1.0
+        + "Host: " + String(host) + "\r\n"
+        + "Authorization: Bearer " + api_key + "\r\n"
+        + "Accept-Encoding: identity;q=1,*;q=0\r\n"
+        + "User-Agent: nArija/1.0\r\n"
+        + "Content-Type: application/json; charset=utf-8\r\n"
+        + "Content-Length: " + post_body.length() + "\r\n"
+        + "Connection: keep-alive\r\n" + "\r\n"
+        + post_body
+    ;
+
+    bool res = true;
+    int port = 443;
+    _client = static_cast<WiFiClient*>(&clientsecure);
+
+    uint32_t t = millis();
+    if (m_f_Log) AUDIO_INFO("connect to %s on port %d path %s", host, port, path);
+    res = _client->connect(host, port, m_timeout_ms_ssl);
+    if (res) {
+        uint32_t dt = millis() - t;
+        strcpy(m_lastHost, host);
+        AUDIO_INFO("%s has been established in %lu ms, free Heap: %lu bytes", "SSL", (long unsigned int) dt, (long unsigned int) ESP.getFreeHeap());
+        m_f_running = true;
+    }
+
+    m_expectedCodec = CODEC_NONE;
+    m_expectedPlsFmt = FORMAT_NONE;
+
+    if (res) {
+        _client->print(http_request);
+        // Pre-select the decoder from the requested format; unknown formats stay CODEC_NONE.
+        if (response_format == "mp3") m_expectedCodec  = CODEC_MP3;
+        if (response_format == "opus") m_expectedCodec  = CODEC_OPUS;
+        if (response_format == "aac") m_expectedCodec  = CODEC_AAC;
+        if (response_format == "flac") m_expectedCodec  = CODEC_FLAC;
+        setDatamode(HTTP_RESPONSE_HEADER);
+        m_streamType = ST_WEBSTREAM;
+    } else {
+        AUDIO_INFO("Request %s failed!", host);
+        m_lastHost[0] = 0;
+    }
+
+    xSemaphoreGiveRecursive(mutex_audio);
+    return res;
+}
+
 //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
    // user and pwd for authentification only, can be empty
@@ -509,6 +605,7 @@ bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
    m_expectedPlsFmt = FORMAT_NONE;

    if(res) {
+        log_i("connecttohost(): %s", rqh);
        _client->print(rqh);
        if(endsWith(extension, ".mp3" )) m_expectedCodec  = CODEC_MP3;
        if(endsWith(extension, ".aac" )) m_expectedCodec  = CODEC_AAC;
@@ -3576,7 +3673,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            continue;
        }

-        // log_i("httpResponseHeader: %s", rhl);
+        //log_i("httpResponseHeader: %s", rhl);

        int16_t posColon = indexOf(rhl, ":", 0); // lowercase all letters up to the colon
        if(posColon >= 0) {
@@ -3650,16 +3747,6 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            AUDIO_INFO("Filename is %s", rhl + pos1);
        }

-        // if(startsWith(rhl, "set-cookie:")         ||
-        //         startsWith(rhl, "pragma:")        ||
-        //         startsWith(rhl, "expires:")       ||
-        //         startsWith(rhl, "cache-control:") ||
-        //         startsWith(rhl, "icy-pub:")       ||
-        //         startsWith(rhl, "p3p:")           ||
-        //         startsWith(rhl, "accept-ranges:") ){
-        //     ; // do nothing
-        // }
-
        else if(startsWith(rhl, "connection:")) {
            if(indexOf(rhl, "close", 0) >= 0) { ; /* do nothing */ }
        }
@@ -3721,7 +3808,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
            if(audio_icydescription) audio_icydescription(c_idesc);
        }

-        else if((startsWith(rhl, "transfer-encoding:"))) {
+        else if(startsWith(rhl, "transfer-encoding:")) {
            if(endsWith(rhl, "chunked") || endsWith(rhl, "Chunked")) { // Station provides chunked transfer
                m_f_chunked = true;
                if(m_f_Log) AUDIO_INFO("chunked data transfer");
@@ -5634,8 +5721,14 @@ boolean Audio::streamDetection(uint32_t bytesAvail) {
        tmr_lost = millis() + 1000;
        if(cnt_lost == 5) { // 5s no data?
            cnt_lost = 0;
-            AUDIO_INFO("Stream lost -> try new connection");
-            connecttohost(m_lastHost);
+            if (String(m_lastHost) == "api.openai.com") {
+                AUDIO_INFO("End of Stream.");
+                m_f_running = false;
+                setDatamode(AUDIO_NONE);
+            } else {
+                AUDIO_INFO("Stream lost -> try new connection");
+                connecttohost(m_lastHost);
+            }
            return true;
        }
    }

--- a/src/Audio.h
+++ b/src/Audio.h
@@ -128,6 +128,7 @@ public:
    Audio(bool internalDAC = false, uint8_t channelEnabled = 3, uint8_t i2sPort = I2S_NUM_0); // #99
    ~Audio();
    void setBufsize(int rambuf_sz, int psrambuf_sz);
+    bool openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed);
    bool connecttohost(const char* host, const char* user = "", const char* pwd = "");
    bool connecttospeech(const char* speech, const char* lang);
    bool connecttoFS(fs::FS &fs, const char* path, int32_t resumeFilePos = -1);