mirror of
https://github.com/Luzifer/webtts.git
synced 2025-03-14 17:17:48 +00:00
Add support for Azure TTS and document usage
This commit is contained in:
parent
067dad5d9a
commit
07780b22e9
6 changed files with 202 additions and 2 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
|||
account.json
|
||||
.env
|
||||
webtts
|
||||
|
|
57
README.md
57
README.md
|
@ -1,3 +1,58 @@
|
|||
# Luzifer / webtts
|
||||
|
||||
This project is a simple wrapper around the [Google Cloud Text-To-Speech](https://cloud.google.com/text-to-speech) API to output OGG Vorbis Audio to be used with OBS overlays.
|
||||
This project is a simple wrapper around the [Google Cloud Text-To-Speech](https://cloud.google.com/text-to-speech) and [Azure Text-To-Speech](https://azure.microsoft.com/en-us/services/cognitive-services/speech-service/) APIs to output OGG Vorbis Audio to be used with OBS overlays.
|
||||
|
||||
## Usage
|
||||
|
||||
### Google Cloud Text-To-Speech
|
||||
|
||||
- Create a project in the [Google Cloud Console](https://console.cloud.google.com/).
|
||||
- Enable the [Text-to-Speech API](https://cloud.google.com/text-to-speech/docs/apis).
|
||||
- Create credentials (Service Account Key) and download it as JSON.
|
||||
- Set the environment variable `GOOGLE_APPLICATION_CREDENTIALS` to the path of the downloaded JSON file.
|
||||
|
||||
### Azure Text-To-Speech
|
||||
|
||||
- Create a Text-To-Speech resource in the [Azure Portal](https://portal.azure.com/).
|
||||
- Navigate to the resource and find the "Keys" section. Copy one of the keys.
|
||||
- Search for a voice in the [Speech Studio](https://speech.microsoft.com/portal) and note its name for the `voice` request parameter.
|
||||
- Set the environment variable `AZURE_SPEECH_RESOURCE_KEY` to the copied key and `AZURE_SPEECH_REGION` to the region where your resource is located.
|
||||
|
||||
### Request
|
||||
|
||||
```
|
||||
GET /tts.ogg
|
||||
?provider=google|azure
|
||||
&lang=en-US
|
||||
&text=The%20text%20to%20convert%20to%20speech
|
||||
&valid-to=<RFC3339-Timestamp>
|
||||
&voice=<name-of-the-voice>
|
||||
&signature=<HMAC-SHA256-Signature>
|
||||
```
|
||||
|
||||
- The `signature` is an HMAC-SHA256 signature of the request parameters, using the secret key
|
||||
- It contains all parameters except for the signature itself
|
||||
- The parameters are sorted by name
|
||||
- The HMAC is generated over the parameters concatenated with `\n`: `param1=value1\nparam2=value2\n...`
|
||||
- The signature is a lower-case hex encoding of the HMAC
|
||||
- The `valid-to` timestamp is an RFC3339 timestamp indicating until when the request is valid
|
||||
|
||||
So for example these would be valid URLs for the key `topsecret`:
|
||||
|
||||
```
|
||||
http://localhost:3000/tts.ogg
|
||||
?lang=en-EN
|
||||
&provider=google
|
||||
&text=Hello%20there%2C%20general%20Kenobi%21
|
||||
&valid-to=2025-01-31T01%3A22%3A17.405263Z
|
||||
&voice=de-DE-Standard-G
|
||||
&signature=afb82dc41b444f9573d585094cf4a22a517853b307d98031c9a324f294db026e
|
||||
|
||||
http://localhost:3000/tts.ogg
|
||||
?lang=en-EN
|
||||
&provider=azure
|
||||
&text=Hello%20there%2C%20general%20Kenobi%21
|
||||
&valid-to=2025-01-31T01%3A23%3A57.905524Z
|
||||
&voice=en-US-AvaMultilingualNeural
|
||||
&signature=ad4b15b78acd7d59a9d659b6b7a67dce2eee070a49634f9be66be3727eb1f5fc
|
||||
```
|
||||
|
|
3
go.mod
3
go.mod
|
@ -9,6 +9,7 @@ require (
|
|||
github.com/Luzifer/go_helpers/v2 v2.25.0
|
||||
github.com/Luzifer/rconfig/v2 v2.5.2
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.10.0
|
||||
)
|
||||
|
||||
require (
|
||||
|
@ -17,6 +18,7 @@ require (
|
|||
cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect
|
||||
cloud.google.com/go/compute/metadata v0.6.0 // indirect
|
||||
cloud.google.com/go/longrunning v0.6.4 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
|
@ -24,6 +26,7 @@ require (
|
|||
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
|
||||
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/spf13/pflag v1.0.6 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 // indirect
|
||||
|
|
19
main.go
19
main.go
|
@ -15,6 +15,7 @@ import (
|
|||
httpHelper "github.com/Luzifer/go_helpers/v2/http"
|
||||
"github.com/Luzifer/rconfig/v2"
|
||||
"github.com/Luzifer/webtts/pkg/synth"
|
||||
"github.com/Luzifer/webtts/pkg/synth/azure"
|
||||
"github.com/Luzifer/webtts/pkg/synth/google"
|
||||
)
|
||||
|
||||
|
@ -22,7 +23,7 @@ var (
|
|||
cfg = struct {
|
||||
Listen string `flag:"listen" default:":3000" description:"Port/IP to listen on"`
|
||||
LogLevel string `flag:"log-level" default:"info" description:"Log level (debug, info, warn, error, fatal)"`
|
||||
SignatureKey string `flag:"signature-key" default:"" description:"Key to sign requests with" validate:"nonzero"`
|
||||
SignatureKey string `flag:"signature-key" default:"" description:"Key to sign requests with"`
|
||||
VersionAndExit bool `flag:"version" default:"false" description:"Prints current version and exits"`
|
||||
}{}
|
||||
|
||||
|
@ -41,6 +42,10 @@ func initApp() (err error) {
|
|||
}
|
||||
logrus.SetLevel(l)
|
||||
|
||||
if cfg.SignatureKey == "" {
|
||||
logrus.Warn("no signature key is set, all requests are valid, do not do this in production!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -101,6 +106,13 @@ func handleTTS(w http.ResponseWriter, r *http.Request) {
|
|||
|
||||
var p synth.Provider
|
||||
switch provider {
|
||||
case "azure":
|
||||
if p, err = azure.New(); err != nil {
|
||||
logrus.WithError(err).Error("creating azure provider")
|
||||
http.Error(w, "creating provider", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
case "google", "gcp":
|
||||
if p, err = google.New(); err != nil {
|
||||
logrus.WithError(err).Error("creating google provider")
|
||||
|
@ -128,6 +140,11 @@ func handleTTS(w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
|
||||
func checkSignature(signature string, r *http.Request) error {
|
||||
if cfg.SignatureKey == "" {
|
||||
// No key given, should only happen in development
|
||||
return nil
|
||||
}
|
||||
|
||||
keys := []string{}
|
||||
for k := range r.URL.Query() {
|
||||
if k == "signature" {
|
||||
|
|
106
pkg/synth/azure/azure.go
Normal file
106
pkg/synth/azure/azure.go
Normal file
|
@ -0,0 +1,106 @@
|
|||
// Package azure provides a text-to-speech synthesis provider for Azure.
|
||||
package azure
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
"github.com/Luzifer/webtts/pkg/synth"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Provider represents the Azure text-to-speech synthesis provider.
type Provider struct{}

// ssmlVoice is the <voice> element of the SSML document: the voice name is
// rendered as the "name" attribute and the text to speak as the element's
// character data.
type ssmlVoice struct {
	Text string `xml:",chardata"`
	Name string `xml:"name,attr"`
}

// ssmlRequest is the <speak> root element of the SSML document sent to the
// service. Attributes are emitted in field order (version, then xml:lang),
// followed by the nested <voice> element.
type ssmlRequest struct {
	XMLName xml.Name  `xml:"speak"`
	Text    string    `xml:",chardata"`
	Version string    `xml:"version,attr"`
	Lang    string    `xml:"xml:lang,attr"`
	Voice   ssmlVoice `xml:"voice"`
}
|
||||
|
||||
var _ synth.Provider = (*Provider)(nil)
|
||||
|
||||
// New creates a new instance of the Azure text-to-speech synthesis provider.
|
||||
func New() (*Provider, error) {
|
||||
speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
|
||||
speechRegion := os.Getenv("AZURE_SPEECH_REGION")
|
||||
|
||||
if speechKey == "" || speechRegion == "" {
|
||||
return nil, fmt.Errorf("missing environment variables: AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION")
|
||||
}
|
||||
|
||||
return &Provider{}, nil
|
||||
}
|
||||
|
||||
// GenerateAudio generates audio from the given text using Azure's Text-to-Speech service.
|
||||
func (p Provider) GenerateAudio(ctx context.Context, voice, language, text string) ([]byte, error) {
|
||||
speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
|
||||
speechRegion := os.Getenv("AZURE_SPEECH_REGION")
|
||||
|
||||
body, err := p.requestSSML(voice, language, text)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("generating SSML: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.apiURL(speechRegion), body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/ssml+xml")
|
||||
req.Header.Set("Ocp-Apim-Subscription-Key", speechKey)
|
||||
req.Header.Set("User-Agent", "webtts/0.x (https://github.com/Luzifer/webtts)")
|
||||
req.Header.Set("X-Microsoft-OutputFormat", "ogg-48khz-16bit-mono-opus")
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("requesting audio: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := resp.Body.Close(); err != nil {
|
||||
logrus.WithError(err).Error("closing response body")
|
||||
}
|
||||
}()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
audioData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading audio data:%w", err)
|
||||
}
|
||||
|
||||
return audioData, nil
|
||||
}
|
||||
|
||||
func (Provider) apiURL(region string) string {
|
||||
return fmt.Sprintf("https://%s.tts.speech.microsoft.com/cognitiveservices/v1", region)
|
||||
}
|
||||
|
||||
func (Provider) requestSSML(voice, language, text string) (io.Reader, error) {
|
||||
var req ssmlRequest
|
||||
req.Lang = language
|
||||
req.Version = "1.0"
|
||||
req.Voice.Name = voice
|
||||
req.Voice.Text = text
|
||||
|
||||
data, err := xml.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshalling ssml request: %w", err)
|
||||
}
|
||||
|
||||
return bytes.NewReader(data), nil
|
||||
}
|
18
pkg/synth/azure/azure_test.go
Normal file
18
pkg/synth/azure/azure_test.go
Normal file
|
@ -0,0 +1,18 @@
|
|||
package azure
|
||||
|
||||
import (
|
||||
"io"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestSSMLGeneration(t *testing.T) {
|
||||
r, err := Provider{}.requestSSML("en-US-ChristopherNeural", "en-US", "excited to try text to speech!")
|
||||
require.NoError(t, err)
|
||||
|
||||
data, err := io.ReadAll(r)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, string(data), `<speak version="1.0" xml:lang="en-US"><voice name="en-US-ChristopherNeural">excited to try text to speech!</voice></speak>`)
|
||||
}
|
Loading…
Add table
Reference in a new issue