Unverified Commit f72086a1 authored by Wolle's avatar Wolle Committed by GitHub

Merge pull request #676 from valentasn/openai_speech

Added OpenAI Text to speech API support
parents 60ee9113 51a1dc8a
# platformio.ini - example for: https://www.seeedstudio.com/XIAO-ESP32S3-p-5627.html
[env:seeed_xiao_esp32s3]
platform = espressif32
board = seeed_xiao_esp32s3
framework = arduino
monitor_speed = 115200
build_flags =
-Wall
-Wextra
-DCORE_DEBUG_LEVEL=3
-DBOARD_HAS_PSRAM
-DAUDIO_LOG
-DARDUINO_RUNNING_CORE=1 ; Arduino Runs On Core (setup, loop)
-DARDUINO_EVENT_RUNNING_CORE=1 ; Events Run On Core
lib_deps =
https://github.com/schreibfaul1/ESP32-audioI2S.git#f2f1f5bcce74523dfc59e5844ba5878ed69c040a
# main.cpp - using xTask example:
```cpp
#include <Arduino.h>
#include "SPI.h"
#include <WiFi.h>
#include <WiFiMulti.h>
#include "Audio.h"
// WiFi credentials
#define WIFI_SSID "<YOUR_WIFI_SSID>"
#define PASSWORD "<YOUR_WIFI_PASSWORD>"
#define OPENAI_API_KEY "<YOUR_OPENAI_API_KEY>"
// Configure I2S pins
#define I2S_LRC D1
#define I2S_DOUT D2
#define I2S_BCLK D3
#define I2S_MCLK 0
// Vars
// Flag shared between the tasks: cleared in setup(), set to true by
// wifiConnect() after the WiFi connection succeeds, polled by playaudio().
bool isWIFIConnected;
// Text that playaudio() logs and passes to audio.openai_speech().
String result = "Added OpenAI Text to speech API support";
// Inits
WiFiMulti wifiMulti;
// Receives the handle of the "playaudio" task created in setup().
TaskHandle_t playaudio_handle;
// One-slot queue of int created in setup(); wifiConnect() sends one item
// after connecting and playaudio() blocks on it before starting playback.
QueueHandle_t audioQueue;
Audio audio;
// Declaration
void audio_info(const char *info);
void wifiConnect(void *pvParameters);
void playaudio(void *pvParameters);
// Default
void setup() {
Serial.begin(115200);
isWIFIConnected = false;
// Create queue
audioQueue = xQueueCreate(1, sizeof(int));
if (audioQueue == NULL) {
Serial.println("Failed to create audioQueue");
while(1);
}
// Create tasks
xTaskCreate(wifiConnect, "wifi_Connect", 4096, NULL, 0, NULL);
delay(500);
xTaskCreate(playaudio, "playaudio", 1024 * 8, NULL, 3, &playaudio_handle);
}
// Arduino loop(): its only job here is to call audio.loop().
// NOTE(review): presumably audio.loop() drives streaming/decoding inside the
// Audio library and must be called continuously - confirm against the library.
void loop(void) {
audio.loop();
}
// Prints `info` on Serial, prefixed with "audio_info: ".
// NOTE(review): presumably invoked by the Audio library as its info callback
// (the console output example shows library messages with this prefix).
void audio_info(const char *info) {
Serial.print("audio_info: ");
Serial.println(info);
}
/**
 * FreeRTOS task: connects to WiFi once, sets isWIFIConnected and then posts
 * a single item to audioQueue so that playaudio() starts the speech request.
 * Afterwards it only sleeps in one-second steps.
 *
 * @param pvParameters unused task argument
 */
void wifiConnect(void *pvParameters) {
    (void) pvParameters;  // unused; avoids the -Wextra unused-parameter warning

    while (1) {
        if (!isWIFIConnected) {
            wifiMulti.addAP(WIFI_SSID, PASSWORD);
            Serial.println("Connecting to WiFi...");
            while (wifiMulti.run() != WL_CONNECTED) {
                // Express the delay in milliseconds, like the other delays in
                // this sketch, instead of in raw scheduler ticks.
                vTaskDelay(500 / portTICK_PERIOD_MS);
            }
            Serial.print("Connected to WiFi\nIP: ");
            Serial.println(WiFi.localIP());
            isWIFIConnected = true;

            Serial.println("Sending result...");
            // The value itself is not used by the receiver, but it is copied
            // into the queue, so it must not be left uninitialised.
            int eventMessage = 0;
            if (xQueueSend(audioQueue, &eventMessage, 0) != pdPASS) {
                Serial.println("Failed to send result to queue");
            }
        } else {
            vTaskDelay(1000 / portTICK_PERIOD_MS);
        }
    }
}
void playaudio(void *pvParameters) {
while(1) {
if (isWIFIConnected && audioQueue != 0) {
int eventMessage;
Serial.println("Waiting for result...");
if (xQueueReceive(audioQueue, &eventMessage, portMAX_DELAY) == pdPASS) {
Serial.print("Received result: ");
Serial.println(result);
// Speech
audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT, -1);
audio.setVolume(15); // 0...21
audio.openai_speech(OPENAI_API_KEY, "tts-1", result, "shimmer", "mp3", "1");
}
} else {
vTaskDelay(1000 / portTICK_PERIOD_MS);
}
}
}
```
---
# console output example:
--- Terminal on /dev/ttyACM0 | 115200 8-N-1
--- Available filters and text transformations: colorize, debug, default, direct, esp32_exception_decoder, hexlify, log2file, nocontrol, printable, send_on_enter, time
--- More details at https://bit.ly/pio-monitor-filters
--- Quit: Ctrl+C | Menu: Ctrl+T | Help: Ctrl+T followed by Ctrl+H
[ 3911][I][WiFiMulti.cpp:114] run(): [WIFI] scan done
[ 3911][I][WiFiMulti.cpp:119] run(): [WIFI] 15 networks found
[ 3911][I][WiFiMulti.cpp:160] run(): [WIFI] Connecting BSSID: 26:AD:69:C2:AB:E8 SSID: OpwnSS Channel: 11 (-38)
[ 4000][I][WiFiMulti.cpp:174] run(): [WIFI] Connecting done.
Connected to WiFi
IP: 192.168.86.23
Sending result...
Waiting for result...
Received result: Added OpenAI Text to speech API support
audio_info: Connect to new host: "api.openai.com"
audio_info: PSRAM found, inputBufferSize: 638965 bytes
[ 4781][I][Audio.cpp:5248] ts_parsePacket(): parseTS reset
audio_info: buffers freed, free Heap: 241976 bytes
audio_info: connect to api.openai.com on port 443 path /v1/audio/speech
audio_info: SSL has been established in 1108 ms, free Heap: 200804 bytes
Waiting for result...
[ 6707][I][Audio.cpp:3949] parseContentType(): ContentType audio/mpeg, format is mp3
audio_info: MP3Decoder has been initialized, free Heap: 201136 bytes , free stack 5648 DWORDs
[ 6711][I][Audio.cpp:3795] parseHttpResponseHeader(): Switch to DATA, metaint is 0
audio_info: stream ready
audio_info: syncword found at pos 0
audio_info: Channels: 1
audio_info: SampleRate: 24000
audio_info: BitsPerSample: 16
audio_info: BitRate: 160000
audio_info: slow stream, dropouts are possible
audio_info: slow stream, dropouts are possible
audio_info: End of Stream.
\ No newline at end of file
#include <Arduino.h>
#include "SPI.h"
#include <WiFi.h>
#include <WiFiMulti.h>
#include "Audio.h"
// WiFi credentials
#define WIFI_SSID "<YOUR_WIFI_SSID>"
#define PASSWORD "<YOUR_WIFI_PASSWORD>"
#define OPENAI_API_KEY "<YOUR_OPENAI_API_KEY>" // https://platform.openai.com/api-keys
// Configure I2S pins
#define I2S_LRC D1
#define I2S_DOUT D2
#define I2S_BCLK D3
#define I2S_MCLK 0
// Inits
// WiFi helper used in setup() to register the access point and connect.
WiFiMulti wifiMulti;
// Audio object: configured and started in setup(), serviced from loop().
Audio audio;
// Declaration
void audio_info(const char *info);
// Default
/**
 * One-time initialisation: opens the serial port, blocks until WiFi is
 * connected, configures the I2S output and sends one fixed sentence to the
 * OpenAI text-to-speech endpoint. A failed request is reported on Serial.
 */
void setup() {
    Serial.begin(115200);

    // Wifi
    wifiMulti.addAP(WIFI_SSID, PASSWORD);
    Serial.println("Connecting to WiFi...");
    while (wifiMulti.run() != WL_CONNECTED) {
        delay(500);
    }
    Serial.print("Connected to WiFi\nIP: ");
    Serial.println(WiFi.localIP());
    delay(500);

    // Speech
    audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT, -1);
    audio.setVolume(15); // 0...21
    // openai_speech() returns false for empty input or a failed connection;
    // report it instead of failing silently.
    if (!audio.openai_speech(OPENAI_API_KEY, "tts-1", "Added OpenAI Text to speech API support", "shimmer", "mp3", "1")) {
        Serial.println("openai_speech request failed");
    }
}
// Arduino loop(): its only job here is to call audio.loop().
// NOTE(review): presumably audio.loop() drives streaming/decoding inside the
// Audio library and must be called continuously - confirm against the library.
void loop(void) {
audio.loop();
}
// Prints `info` on Serial, prefixed with "audio_info: ".
// NOTE(review): presumably invoked by the Audio library as its info callback
// (the console output example shows library messages with this prefix).
void audio_info(const char *info) {
Serial.print("audio_info: ");
Serial.println(info);
}
\ No newline at end of file
......@@ -371,6 +371,102 @@ void Audio::setConnectionTimeout(uint16_t timeout_ms, uint16_t timeout_ms_ssl) {
if(timeout_ms_ssl) m_timeout_ms_ssl = timeout_ms_ssl;
}
/*
Text to speech API provides a speech endpoint based on our TTS (text-to-speech) model.
More info: https://platform.openai.com/docs/guides/text-to-speech/text-to-speech
Request body:
model (string) [Required] - One of the available TTS models: tts-1 or tts-1-hd
input (string) [Required] - The text to generate audio for. The maximum length is 4096 characters.
voice (string) [Required] - The voice to use when generating the audio. Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
response_format (string) [Optional] - Defaults to mp3. The format to audio in. Supported formats are mp3, opus, aac, and flac.
speed (number) [Optional] - Defaults to 1. The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
Usage: audio.openai_speech(OPENAI_API_KEY, "tts-1", input, "shimmer", "mp3", "1");
*/
bool Audio::openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed) {
char host[] = "api.openai.com";
char path[] = "/v1/audio/speech";
xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);
if (input == "") {
AUDIO_INFO("input text is empty");
stopSong();
xSemaphoreGiveRecursive(mutex_audio);
return false;
}
AUDIO_INFO("Connect to new host: \"%s\"", host);
setDefaults();
m_f_ssl = true;
String input_clean = "";
for (int i = 0; i < input.length(); i++) {
char c = input.charAt(i);
if (c == '\"') {
input_clean += "\\\"";
} else if (c == '\n') {
input_clean += "\\n";
} else {
input_clean += c;
}
}
String post_body = "{"
"\"model\": \"" + model + "\"," +
"\"input\": \"" + input_clean + "\"," +
"\"voice\": \"" + voice + "\"," +
"\"response_format\": \"" + response_format + "\"," +
"\"speed\": \"" + speed + "\"" +
"}";
String http_request =
"POST " + String(path) + " HTTP/1.0\r\n" // UNKNOWN ERROR CODE (0050) - crashing on HTTP/1.1 need to use HTTP/1.0
+ "Host: " + String(host) + "\r\n"
+ "Authorization: Bearer " + api_key + "\r\n"
+ "Accept-Encoding: identity;q=1,*;q=0\r\n"
+ "User-Agent: nArija/1.0\r\n"
+ "Content-Type: application/json; charset=utf-8\r\n"
+ "Content-Length: " + post_body.length() + "\r\n"
+ "Connection: keep-alive\r\n" + "\r\n"
+ post_body + "\r\n"
;
bool res = true;
int port = 443;
_client = static_cast<WiFiClient*>(&clientsecure);
uint32_t t = millis();
if (m_f_Log) AUDIO_INFO("connect to %s on port %d path %s", host, port, path);
res = _client->connect(host, port, m_timeout_ms_ssl);
if (res) {
uint32_t dt = millis() - t;
strcpy(m_lastHost, host);
AUDIO_INFO("%s has been established in %lu ms, free Heap: %lu bytes", "SSL", (long unsigned int) dt, (long unsigned int) ESP.getFreeHeap());
m_f_running = true;
}
m_expectedCodec = CODEC_NONE;
m_expectedPlsFmt = FORMAT_NONE;
if (res) {
_client->print(http_request);
if (response_format == "mp3") m_expectedCodec = CODEC_MP3;
if (response_format == "opus") m_expectedCodec = CODEC_OPUS;
if (response_format == "aac") m_expectedCodec = CODEC_AAC;
if (response_format == "flac") m_expectedCodec = CODEC_FLAC;
setDatamode(HTTP_RESPONSE_HEADER);
m_streamType = ST_WEBSTREAM;
} else {
AUDIO_INFO("Request %s failed!", host);
m_lastHost[0] = 0;
}
xSemaphoreGiveRecursive(mutex_audio);
return res;
}
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
// user and pwd for authentification only, can be empty
......@@ -509,6 +605,7 @@ bool Audio::connecttohost(const char* host, const char* user, const char* pwd) {
m_expectedPlsFmt = FORMAT_NONE;
if(res) {
log_i("connecttohost(): %s", rqh);
_client->print(rqh);
if(endsWith(extension, ".mp3" )) m_expectedCodec = CODEC_MP3;
if(endsWith(extension, ".aac" )) m_expectedCodec = CODEC_AAC;
......@@ -3576,7 +3673,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
continue;
}
// log_i("httpResponseHeader: %s", rhl);
//log_i("httpResponseHeader: %s", rhl);
int16_t posColon = indexOf(rhl, ":", 0); // lowercase all letters up to the colon
if(posColon >= 0) {
......@@ -3650,16 +3747,6 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
AUDIO_INFO("Filename is %s", rhl + pos1);
}
// if(startsWith(rhl, "set-cookie:") ||
// startsWith(rhl, "pragma:") ||
// startsWith(rhl, "expires:") ||
// startsWith(rhl, "cache-control:") ||
// startsWith(rhl, "icy-pub:") ||
// startsWith(rhl, "p3p:") ||
// startsWith(rhl, "accept-ranges:") ){
// ; // do nothing
// }
else if(startsWith(rhl, "connection:")) {
if(indexOf(rhl, "close", 0) >= 0) { ; /* do nothing */ }
}
......@@ -3721,7 +3808,7 @@ bool Audio::parseHttpResponseHeader() { // this is the response to a GET / reque
if(audio_icydescription) audio_icydescription(c_idesc);
}
else if((startsWith(rhl, "transfer-encoding:"))) {
else if(startsWith(rhl, "transfer-encoding:")) {
if(endsWith(rhl, "chunked") || endsWith(rhl, "Chunked")) { // Station provides chunked transfer
m_f_chunked = true;
if(m_f_Log) AUDIO_INFO("chunked data transfer");
......@@ -5634,8 +5721,14 @@ boolean Audio::streamDetection(uint32_t bytesAvail) {
tmr_lost = millis() + 1000;
if(cnt_lost == 5) { // 5s no data?
cnt_lost = 0;
AUDIO_INFO("Stream lost -> try new connection");
connecttohost(m_lastHost);
if (String(m_lastHost) == "api.openai.com") {
AUDIO_INFO("End of Stream.");
m_f_running = false;
setDatamode(AUDIO_NONE);
} else {
AUDIO_INFO("Stream lost -> try new connection");
connecttohost(m_lastHost);
}
return true;
}
}
......
......@@ -128,6 +128,7 @@ public:
Audio(bool internalDAC = false, uint8_t channelEnabled = 3, uint8_t i2sPort = I2S_NUM_0); // #99
~Audio();
void setBufsize(int rambuf_sz, int psrambuf_sz);
bool openai_speech(const String& api_key, const String& model, const String& input, const String& voice, const String& response_format, const String& speed);
bool connecttohost(const char* host, const char* user = "", const char* pwd = "");
bool connecttospeech(const char* speech, const char* lang);
bool connecttoFS(fs::FS &fs, const char* path, int32_t resumeFilePos = -1);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment