1
0
Fork 0
mirror of https://github.com/Luzifer/webtts.git synced 2025-03-31 17:15:15 +00:00
webtts/pkg/synth/azure/azure.go

106 lines
2.8 KiB
Go

// Package azure provides a text-to-speech synthesis provider for Azure.
package azure
import (
"bytes"
"context"
"encoding/xml"
"fmt"
"io"
"net/http"
"os"
"github.com/Luzifer/webtts/pkg/synth"
"github.com/sirupsen/logrus"
)
// Provider represents the Azure text-to-speech synthesis provider.
//
// It carries no state of its own; the methods read the Azure credentials
// from the environment when they need them.
type Provider struct{}

// ssmlRequest is the document marshalled by encoding/xml into the request
// body: a <speak> root element containing exactly one <voice> child.
type ssmlRequest struct {
	// XMLName fixes the name of the root element to "speak".
	XMLName xml.Name `xml:"speak"`
	// Text is character data written directly inside <speak>.
	Text string `xml:",chardata"`
	// Version is emitted as the "version" attribute of <speak>.
	Version string `xml:"version,attr"`
	// Lang is emitted as the "xml:lang" attribute of <speak>.
	Lang string `xml:"xml:lang,attr"`
	// Voice is the nested <voice> element.
	Voice struct {
		// Text is the character data of <voice>.
		Text string `xml:",chardata"`
		// Name is emitted as the "name" attribute of <voice>.
		Name string `xml:"name,attr"`
	} `xml:"voice"`
}
// Compile-time assertion that *Provider satisfies the synth.Provider
// interface; the blank identifier means no value is kept at runtime.
var _ synth.Provider = (*Provider)(nil)
// New returns a Provider for the Azure text-to-speech service.
//
// It only verifies that both the AZURE_SPEECH_RESOURCE_KEY and the
// AZURE_SPEECH_REGION environment variables are non-empty; the values are
// not stored in the returned Provider. If either one is unset or empty, a
// nil Provider and an error are returned.
func New() (*Provider, error) {
	required := [...]string{"AZURE_SPEECH_RESOURCE_KEY", "AZURE_SPEECH_REGION"}

	for _, name := range required {
		if os.Getenv(name) != "" {
			continue
		}

		return nil, fmt.Errorf("missing environment variables: AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION")
	}

	return &Provider{}, nil
}
// GenerateAudio generates audio from the given text using Azure's
// Text-to-Speech service.
//
// The credentials are read from the AZURE_SPEECH_RESOURCE_KEY and
// AZURE_SPEECH_REGION environment variables on every call. The text is
// wrapped into an SSML document for the given voice and language, posted
// to the regional endpoint with the "ogg-48khz-16bit-mono-opus" output
// format requested, and the raw response body is returned.
//
// An error is returned when a credential variable is empty, when the SSML
// document or the request cannot be built, when the request fails, when
// the service answers with anything other than 200 OK, or when the
// response body cannot be read. The request is bound to ctx.
func (p Provider) GenerateAudio(ctx context.Context, voice, language, text string) ([]byte, error) {
	speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
	speechRegion := os.Getenv("AZURE_SPEECH_REGION")

	// New performs the same check, but the environment is re-read here, so
	// a zero-value Provider or a variable unset after construction would
	// otherwise end up as a request to a malformed host or without a key.
	if speechKey == "" || speechRegion == "" {
		return nil, fmt.Errorf("missing environment variables: AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION")
	}

	body, err := p.requestSSML(voice, language, text)
	if err != nil {
		return nil, fmt.Errorf("generating SSML: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.apiURL(speechRegion), body)
	if err != nil {
		return nil, fmt.Errorf("creating request: %w", err)
	}

	req.Header.Set("Content-Type", "application/ssml+xml")
	req.Header.Set("Ocp-Apim-Subscription-Key", speechKey)
	req.Header.Set("User-Agent", "webtts/0.x (https://github.com/Luzifer/webtts)")
	req.Header.Set("X-Microsoft-OutputFormat", "ogg-48khz-16bit-mono-opus")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("requesting audio: %w", err)
	}
	defer func() {
		// A failed close cannot be returned from here; it is only logged.
		if err := resp.Body.Close(); err != nil {
			logrus.WithError(err).Error("closing response body")
		}
	}()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	audioData, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading audio data: %w", err)
	}

	return audioData, nil
}
// apiURL returns the text-to-speech endpoint URL for the given region. The
// region is inserted verbatim as the first label of the host name.
func (Provider) apiURL(region string) string {
	const endpointFormat = "https://%s.tts.speech.microsoft.com/cognitiveservices/v1"

	return fmt.Sprintf(endpointFormat, region)
}
// requestSSML builds the SSML request body: a <speak> element with version
// "1.0" and the given language, wrapping a single <voice> element that
// carries the voice name and the text as character data. The document is
// produced by xml.Marshal and returned as a reader over the encoded bytes.
func (Provider) requestSSML(voice, language, text string) (io.Reader, error) {
	doc := ssmlRequest{
		Version: "1.0",
		Lang:    language,
	}
	doc.Voice.Name, doc.Voice.Text = voice, text

	encoded, err := xml.Marshal(doc)
	if err != nil {
		return nil, fmt.Errorf("marshalling ssml request: %w", err)
	}

	return bytes.NewReader(encoded), nil
}