mirror of
https://github.com/Luzifer/webtts.git
synced 2025-03-31 17:15:15 +00:00
106 lines
2.8 KiB
Go
106 lines
2.8 KiB
Go
// Package azure provides a text-to-speech synthesis provider for Azure.
|
|
package azure
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
|
|
"github.com/Luzifer/webtts/pkg/synth"
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
type (
|
|
// Provider represents the Azure text-to-speech synthesis provider.
|
|
Provider struct{}
|
|
|
|
ssmlRequest struct {
|
|
XMLName xml.Name `xml:"speak"`
|
|
Text string `xml:",chardata"`
|
|
Version string `xml:"version,attr"`
|
|
Lang string `xml:"xml:lang,attr"`
|
|
Voice struct {
|
|
Text string `xml:",chardata"`
|
|
Name string `xml:"name,attr"`
|
|
} `xml:"voice"`
|
|
}
|
|
)
|
|
|
|
var _ synth.Provider = (*Provider)(nil)
|
|
|
|
// New creates a new instance of the Azure text-to-speech synthesis provider.
|
|
func New() (*Provider, error) {
|
|
speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
|
|
speechRegion := os.Getenv("AZURE_SPEECH_REGION")
|
|
|
|
if speechKey == "" || speechRegion == "" {
|
|
return nil, fmt.Errorf("missing environment variables: AZURE_SPEECH_RESOURCE_KEY and AZURE_SPEECH_REGION")
|
|
}
|
|
|
|
return &Provider{}, nil
|
|
}
|
|
|
|
// GenerateAudio generates audio from the given text using Azure's Text-to-Speech service.
|
|
func (p Provider) GenerateAudio(ctx context.Context, voice, language, text string) ([]byte, error) {
|
|
speechKey := os.Getenv("AZURE_SPEECH_RESOURCE_KEY")
|
|
speechRegion := os.Getenv("AZURE_SPEECH_REGION")
|
|
|
|
body, err := p.requestSSML(voice, language, text)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("generating SSML: %w", err)
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.apiURL(speechRegion), body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("creating request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Content-Type", "application/ssml+xml")
|
|
req.Header.Set("Ocp-Apim-Subscription-Key", speechKey)
|
|
req.Header.Set("User-Agent", "webtts/0.x (https://github.com/Luzifer/webtts)")
|
|
req.Header.Set("X-Microsoft-OutputFormat", "ogg-48khz-16bit-mono-opus")
|
|
|
|
resp, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("requesting audio: %w", err)
|
|
}
|
|
defer func() {
|
|
if err := resp.Body.Close(); err != nil {
|
|
logrus.WithError(err).Error("closing response body")
|
|
}
|
|
}()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
|
}
|
|
|
|
audioData, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading audio data:%w", err)
|
|
}
|
|
|
|
return audioData, nil
|
|
}
|
|
|
|
func (Provider) apiURL(region string) string {
|
|
return fmt.Sprintf("https://%s.tts.speech.microsoft.com/cognitiveservices/v1", region)
|
|
}
|
|
|
|
func (Provider) requestSSML(voice, language, text string) (io.Reader, error) {
|
|
var req ssmlRequest
|
|
req.Lang = language
|
|
req.Version = "1.0"
|
|
req.Voice.Name = voice
|
|
req.Voice.Text = text
|
|
|
|
data, err := xml.Marshal(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshalling ssml request: %w", err)
|
|
}
|
|
|
|
return bytes.NewReader(data), nil
|
|
}
|