1
0
Fork 0
mirror of https://github.com/Luzifer/webtts.git synced 2025-03-14 17:17:48 +00:00

Add support for Azure TTS and document usage

This commit is contained in:
Knut Ahlers 2025-01-30 19:32:43 +01:00
parent 067dad5d9a
commit 07780b22e9
Signed by: luzifer
SSH key fingerprint: SHA256:/xtE5lCgiRDQr8SLxHMS92ZBlACmATUmF1crK16Ks4E
6 changed files with 202 additions and 2 deletions

1
.gitignore vendored
View file

@ -1,2 +1,3 @@
account.json
.env
webtts

View file

@ -1,3 +1,58 @@
# Luzifer / webtts
This project is a simple wrapper around the [Google Cloud Text-To-Speech](https://cloud.google.com/text-to-speech) API to output OGG Vorbis Audio to be used with OBS overlays.
This project is a simple wrapper around the [Google Cloud Text-To-Speech](https://cloud.google.com/text-to-speech) and [Azure Text-To-Speech](https://azure.microsoft.com/en-us/services/cognitive-services/speech-service/) API to output OGG Vorbis Audio to be used with OBS overlays.
## Usage
### Google Cloud Text-To-Speech
- Create a project in the [Google Cloud Console](https://console.cloud.google.com/).
- Enable the [Text-to-Speech API](https://cloud.google.com/text-to-speech/docs/apis).
- Create credentials (Service Account Key) and download it as JSON.
- Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the downloaded JSON file.
### Azure Text-To-Speech
- Create a Text-To-Speech resource in the [Azure Portal](https://portal.azure.com/).
- Navigate to the resource and find the "Keys" section. Copy one of the keys.
- Search for a voice in the [Speech Studio](https://speech.microsoft.com/portal)
- Set the environment variable `AZURE_SPEECH_RESOURCE_KEY` to the copied key and `AZURE_SPEECH_REGION` to the region where your resource is located.
### Request
```
GET /tts.ogg
?provider=google|azure
&lang=en-US
&text=The%20text%20to%20convert%20to%20speech
&valid-to=<RFC3339-Timestamp>
&voice=<name-of-the-voice>
&signature=<HMAC-SHA256-Signature>
```
- The `signature` is an HMAC-SHA256 signature of the request parameters, using the secret key
- It contains all parameters except for the signature itself
- The parameters are sorted by name
- The HMAC is generated over the parameters concatenated with `\n`: `param1=value1\nparam2=value2\n...`
- The signature is a lower-case hex encoding of the HMAC
- The `valid-to` timestamp is an RFC3339 timestamp indicating until when the request is valid
So for example these would be valid URLs for the key `topsecret`:
```
http://localhost:3000/tts.ogg
?lang=en-EN
&provider=google
&text=Hello%20there%2C%20general%20Kenobi%21
&valid-to=2025-01-31T01%3A22%3A17.405263Z
&voice=de-DE-Standard-G
&signature=afb82dc41b444f9573d585094cf4a22a517853b307d98031c9a324f294db026e
http://localhost:3000/tts.ogg
?lang=en-EN
&provider=azure
&text=Hello%20there%2C%20general%20Kenobi%21
&valid-to=2025-01-31T01%3A23%3A57.905524Z
&voice=en-US-AvaMultilingualNeural
&signature=ad4b15b78acd7d59a9d659b6b7a67dce2eee070a49634f9be66be3727eb1f5fc
```

3
go.mod
View file

@ -9,6 +9,7 @@ require (
github.com/Luzifer/go_helpers/v2 v2.25.0
github.com/Luzifer/rconfig/v2 v2.5.2
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.10.0
)
require (
@ -17,6 +18,7 @@ require (
cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect
cloud.google.com/go/compute/metadata v0.6.0 // indirect
cloud.google.com/go/longrunning v0.6.4 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
@ -24,6 +26,7 @@ require (
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/spf13/pflag v1.0.6 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 // indirect

19
main.go
View file

@ -15,6 +15,7 @@ import (
httpHelper "github.com/Luzifer/go_helpers/v2/http"
"github.com/Luzifer/rconfig/v2"
"github.com/Luzifer/webtts/pkg/synth"
"github.com/Luzifer/webtts/pkg/synth/azure"
"github.com/Luzifer/webtts/pkg/synth/google"
)
@ -22,7 +23,7 @@ var (
cfg = struct {
Listen string `flag:"listen" default:":3000" description:"Port/IP to listen on"`
LogLevel string `flag:"log-level" default:"info" description:"Log level (debug, info, warn, error, fatal)"`
SignatureKey string `flag:"signature-key" default:"" description:"Key to sign requests with" validate:"nonzero"`
SignatureKey string `flag:"signature-key" default:"" description:"Key to sign requests with"`
VersionAndExit bool `flag:"version" default:"false" description:"Prints current version and exits"`
}{}
@ -41,6 +42,10 @@ func initApp() (err error) {
}
logrus.SetLevel(l)
if cfg.SignatureKey == "" {
logrus.Warn("no signature key is set, all requests are valid, do not do this in production!")
}
return nil
}
@ -101,6 +106,13 @@ func handleTTS(w http.ResponseWriter, r *http.Request) {
var p synth.Provider
switch provider {
case "azure":
if p, err = azure.New(); err != nil {
logrus.WithError(err).Error("creating azure provider")
http.Error(w, "creating provider", http.StatusInternalServerError)
return
}
case "google", "gcp":
if p, err = google.New(); err != nil {
logrus.WithError(err).Error("creating google provider")
@ -128,6 +140,11 @@ func handleTTS(w http.ResponseWriter, r *http.Request) {
}
func checkSignature(signature string, r *http.Request) error {
if cfg.SignatureKey == "" {
// No key given, should only happen in development
return nil
}
keys := []string{}
for k := range r.URL.Query() {
if k == "signature" {

106
pkg/synth/azure/azure.go Normal file
View file

@ -0,0 +1,106 @@
// Package azure provides a text-to-speech synthesis provider for Azure.
package azure
import (
"bytes"
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"os"
"github.com/Luzifer/webtts/pkg/synth"
"github.com/sirupsen/logrus"
)
// Provider represents the Azure text-to-speech synthesis provider.
// It carries no state of its own.
type Provider struct{}

// ssmlVoice is the <voice> element of an SSML document: the voice to use
// is given in the "name" attribute and the text to speak is the element's
// character data.
type ssmlVoice struct {
	Text string `xml:",chardata"`
	Name string `xml:"name,attr"`
}

// ssmlRequest is the <speak> root element of the SSML document which is
// marshalled into the request body. It carries the "version" and
// "xml:lang" attributes and wraps a single <voice> element.
type ssmlRequest struct {
	XMLName xml.Name  `xml:"speak"`
	Text    string    `xml:",chardata"`
	Version string    `xml:"version,attr"`
	Lang    string    `xml:"xml:lang,attr"`
	Voice   ssmlVoice `xml:"voice"`
}
// Compile-time assertion that *Provider satisfies the synth.Provider interface.
var _ synth.Provider = (*Provider)(nil)
// New creates a new instance of the Azure text-to-speech synthesis provider.
//
// Both the AZURE_SPEECH_RESOURCE_KEY and the AZURE_SPEECH_REGION environment
// variables must be set to non-empty values, otherwise an error is returned
// and no provider is created.
func New() (*Provider, error) {
	key, region := os.Getenv("AZURE_SPEECH_RESOURCE_KEY"), os.Getenv("AZURE_SPEECH_REGION")

	if key != "" && region != "" {
		return &Provider{}, nil
	}

	return nil, fmt.Errorf("missing environment variables: AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION")
}
// GenerateAudio generates audio from the given text using Azure's Text-to-Speech service.
//
// The subscription key and the region are read from the
// AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION environment variables on
// every call. The text is wrapped into an SSML document for the given voice
// and language and sent as a POST request bound to ctx. The requested output
// format is "ogg-48khz-16bit-mono-opus".
//
// On success the raw response body is returned. An error is returned when
// the SSML cannot be built, the request cannot be created or sent, the
// service answers with a status other than 200 OK, or the body cannot be read.
func (p Provider) GenerateAudio(ctx context.Context, voice, language, text string) ([]byte, error) {
	speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
	speechRegion := os.Getenv("AZURE_SPEECH_REGION")

	body, err := p.requestSSML(voice, language, text)
	if err != nil {
		return nil, fmt.Errorf("generating SSML: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.apiURL(speechRegion), body)
	if err != nil {
		return nil, fmt.Errorf("creating request: %w", err)
	}

	req.Header.Set("Content-Type", "application/ssml+xml")
	req.Header.Set("Ocp-Apim-Subscription-Key", speechKey)
	req.Header.Set("User-Agent", "webtts/0.x (https://github.com/Luzifer/webtts)")
	req.Header.Set("X-Microsoft-OutputFormat", "ogg-48khz-16bit-mono-opus")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("requesting audio: %w", err)
	}
	defer func() {
		// A failed close cannot be returned to the caller anymore, so it is
		// only logged.
		if closeErr := resp.Body.Close(); closeErr != nil {
			logrus.WithError(closeErr).Error("closing response body")
		}
	}()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	audioData, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading audio data: %w", err)
	}

	return audioData, nil
}
// apiURL builds the text-to-speech REST endpoint URL for the given region
// by inserting the region as the leading host label.
func (Provider) apiURL(region string) string {
	const endpointTemplate = "https://%s.tts.speech.microsoft.com/cognitiveservices/v1"

	return fmt.Sprintf(endpointTemplate, region)
}
// requestSSML renders the SSML document used as request body: a version
// "1.0" <speak> element carrying the language, wrapping a <voice> element
// which names the voice and contains the text. The marshalled document is
// returned as a reader; a marshalling failure is returned as an error.
func (Provider) requestSSML(voice, language, text string) (io.Reader, error) {
	doc := ssmlRequest{
		Version: "1.0",
		Lang:    language,
	}
	doc.Voice.Name = voice
	doc.Voice.Text = text

	payload, err := xml.Marshal(doc)
	if err != nil {
		return nil, fmt.Errorf("marshalling ssml request: %w", err)
	}

	return bytes.NewReader(payload), nil
}

View file

@ -0,0 +1,18 @@
package azure
import (
"io"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestSSMLGeneration verifies that requestSSML renders the expected SSML
// document for a given voice, language and text.
func TestSSMLGeneration(t *testing.T) {
	r, err := Provider{}.requestSSML("en-US-ChristopherNeural", "en-US", "excited to try text to speech!")
	require.NoError(t, err)

	data, err := io.ReadAll(r)
	require.NoError(t, err)

	// assert.Equal takes (expected, actual); keeping this order makes the
	// failure output label both values correctly.
	assert.Equal(t, `<speak version="1.0" xml:lang="en-US"><voice name="en-US-ChristopherNeural">excited to try text to speech!</voice></speak>`, string(data))
}