218 lines
5.6 KiB
Go
218 lines
5.6 KiB
Go
package service
|
|
|
|
import (
|
|
speech "cloud.google.com/go/speech/apiv1"
|
|
"cloud.google.com/go/speech/apiv1/speechpb"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"google.golang.org/api/option"
|
|
"google.golang.org/protobuf/types/known/wrapperspb"
|
|
"io"
|
|
"log"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type Service struct {
|
|
mux *http.ServeMux
|
|
client *http.Client
|
|
speech *speech.Client
|
|
}
|
|
|
|
func New() *Service {
|
|
sc, err := speech.NewClient(context.Background(),
|
|
option.WithAPIKey("some key"))
|
|
if err != nil {
|
|
panic(fmt.Sprintf("speech startup failed: %s!", err))
|
|
}
|
|
s := &Service{
|
|
mux: http.NewServeMux(),
|
|
client: &http.Client{
|
|
Timeout: 5 * time.Second,
|
|
},
|
|
speech: sc,
|
|
}
|
|
s.mux.HandleFunc("/NmspServlet/", s.nmsp)
|
|
return s
|
|
}
|
|
|
|
func (s *Service) Close() error {
|
|
return s.speech.Close()
|
|
}
|
|
|
|
func (s *Service) nmsp(rw http.ResponseWriter, request *http.Request) {
|
|
accessToken, lang, err := tokensFromHost(request.Host)
|
|
if err != nil {
|
|
http.Error(rw, err.Error(), http.StatusBadRequest)
|
|
return
|
|
}
|
|
if err := s.authenticate(accessToken, request.Context()); err != nil {
|
|
http.Error(rw, err.Error(), http.StatusUnauthorized)
|
|
return
|
|
}
|
|
|
|
reader, err := request.MultipartReader()
|
|
if err != nil {
|
|
http.Error(rw, fmt.Sprintf("failed to open reader: %s", err), 400)
|
|
return
|
|
}
|
|
|
|
// start an async request to ASR
|
|
rec, err := s.speech.StreamingRecognize(request.Context())
|
|
if err != nil {
|
|
http.Error(rw, fmt.Sprintf("starting speech recognition failed: %s", err), http.StatusInternalServerError)
|
|
}
|
|
|
|
err = rec.Send(&speechpb.StreamingRecognizeRequest{
|
|
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
|
|
StreamingConfig: &speechpb.StreamingRecognitionConfig{
|
|
Config: &speechpb.RecognitionConfig{
|
|
Encoding: speechpb.RecognitionConfig_SPEEX_WITH_HEADER_BYTE,
|
|
SampleRateHertz: 16000,
|
|
AudioChannelCount: 1,
|
|
LanguageCode: lang,
|
|
MaxAlternatives: 1,
|
|
ProfanityFilter: false,
|
|
EnableWordTimeOffsets: false,
|
|
EnableWordConfidence: true, // this gets passed all the way to Pebble, which does nothing?
|
|
EnableAutomaticPunctuation: true, // controversial.
|
|
EnableSpokenPunctuation: wrapperspb.Bool(true),
|
|
EnableSpokenEmojis: wrapperspb.Bool(false), // tentatively disabled due to poor font support.
|
|
Metadata: &speechpb.RecognitionMetadata{
|
|
MicrophoneDistance: speechpb.RecognitionMetadata_NEARFIELD,
|
|
},
|
|
Model: "latest_short", // apparently this is the New Hotness
|
|
UseEnhanced: false,
|
|
},
|
|
SingleUtterance: false, // end-of-utterance detection is done by Pebble
|
|
InterimResults: false, // the pebble protocol doesn't have any way to deal with these
|
|
EnableVoiceActivityEvents: false, // we don't need this
|
|
VoiceActivityTimeout: nil, // again, handled by the Pebble device.
|
|
},
|
|
},
|
|
})
|
|
if err != nil {
|
|
http.Error(rw, fmt.Sprintf("setting up dictation session failed: %s", err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
go func() {
|
|
for {
|
|
part, err := reader.NextPart()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
break
|
|
}
|
|
if part.FormName() == "ConcludingAudioParameter" {
|
|
bytes, err := io.ReadAll(part)
|
|
if err != nil {
|
|
// TODO: something?
|
|
break
|
|
}
|
|
err = rec.Send(&speechpb.StreamingRecognizeRequest{
|
|
StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
|
|
AudioContent: bytes,
|
|
},
|
|
})
|
|
if err != nil {
|
|
// TODO: something?
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
|
|
mp := multipart.NewWriter(rw)
|
|
must(mp.SetBoundary("--Nuance_NMSP_vutc5w1XobDdefsYG3wq"))
|
|
rw.Header().Set("Content-Type", "multipart/form-data; boundary="+mp.Boundary())
|
|
rw.WriteHeader(200)
|
|
|
|
firstWord := true
|
|
|
|
for {
|
|
resp, err := rec.Recv()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
|
|
tr := TranscriptionResponse{}
|
|
|
|
for _, result := range resp.Results {
|
|
if len(result.Alternatives) == 0 {
|
|
continue
|
|
}
|
|
for _, word := range result.Alternatives[0].Words {
|
|
tr.Words = append(tr.Words, TranscriptionWord{
|
|
Word: word.Word,
|
|
Confidence: word.Confidence,
|
|
})
|
|
}
|
|
}
|
|
|
|
if firstWord && len(tr.Words) > 0 {
|
|
tr.Words[0].Word = tr.Words[0].Word + `\*no-space-before`
|
|
firstWord = false
|
|
}
|
|
|
|
content, err := json.Marshal(tr)
|
|
if err != nil {
|
|
log.Printf("Failed to marshal response JSON: %s", err)
|
|
continue
|
|
}
|
|
|
|
if err := mp.WriteField("QueryResult", string(content)); err != nil {
|
|
log.Printf("Failed to write response JSON: %s", err)
|
|
continue
|
|
}
|
|
}
|
|
_ = mp.Close()
|
|
}
|
|
|
|
// must panics with err when err is non-nil and does nothing otherwise.
func must(err error) {
	if err == nil {
		return
	}
	panic(err)
}
|
|
|
|
// TranscriptionWord is one word of a transcription as it appears in the JSON
// response.
type TranscriptionWord struct {
	// Word is the text of the word, serialized as "word".
	Word string `json:"word"`
	// Confidence is the per-word confidence copied from the recognition
	// result, serialized as "confidence".
	Confidence float32 `json:"confidence"`
}
|
|
|
|
type TranscriptionResponse struct {
|
|
Words []TranscriptionWord `json:"words"`
|
|
}
|
|
|
|
func (s *Service) authenticate(accessToken string, ctx context.Context) error {
|
|
authReq, err := http.NewRequestWithContext(ctx, "GET", "%s/api/v1/me/token", nil)
|
|
if err != nil {
|
|
return fmt.Errorf("couldn't create auth request: %w", err)
|
|
}
|
|
authReq.Header.Add("Authorization", "Bearer "+accessToken)
|
|
resp, err := s.client.Do(authReq)
|
|
if err != nil {
|
|
return fmt.Errorf("requesting authorization failed: %w", err)
|
|
}
|
|
if resp.StatusCode != 200 {
|
|
return errors.New("unauthorised")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// tokensFromHost splits the first DNS label of host at its first "-" into an
// access token (before the dash) and a language (everything after it).
//
// It returns an error when the first label contains no "-".
func tokensFromHost(host string) (accessToken, lang string, err error) {
	// Everything up to the first "." is the label carrying both values.
	label, _, _ := strings.Cut(host, ".")

	token, language, found := strings.Cut(label, "-")
	if !found {
		return "", "", fmt.Errorf("invalid host %q", host)
	}
	return token, language, nil
}
|
|
|
|
func (s *Service) ListenAndServe(addr string) error {
|
|
return http.ListenAndServe(addr, s.mux)
|
|
}
|