Cloud Speech-to-Text API を試す
doc
ここからAPIを有効にする
実行
code:sh
GOOGLE_APPLICATION_CREDENTIALS=<credential file path> go run main.go
サンプルコード
code:main.go
package main
import (
"context"
"fmt"
"io"
"io/ioutil"
"log"
"os"
speech "cloud.google.com/go/speech/apiv1"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)
// file is the path of the audio file that main passes to
// transcribeMultichannel.
var file = "hoge.wav"
// main transcribes the configured audio file, writing the result to standard
// output, and terminates the program via log.Fatal if transcription fails.
func main() {
	err := transcribeMultichannel(os.Stdout, file)
	if err != nil {
		log.Fatal(err)
	}
}
// transcribeMultichannel sends the audio file at path to the Cloud
// Speech-to-Text Recognize API and writes the recognized words, grouped by
// channel, to w.
//
// The request is configured for LINEAR16 audio at 16000 Hz with two channels,
// language "ja-JP", separate recognition per channel, word time offsets and
// automatic punctuation.
//
// It returns a wrapped error if the client cannot be created, the file cannot
// be read, or the Recognize call fails.
func transcribeMultichannel(w io.Writer, path string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	// Release the client's resources on every return path. The Close error is
	// intentionally ignored: nothing useful can be done with it here.
	defer client.Close()

	// ioutil.ReadFile is kept because the file's import block provides
	// io/ioutil; the whole file is loaded and sent inline in the request.
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return fmt.Errorf("ReadFile: %w", err)
	}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:                            speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz:                     16000,
			LanguageCode:                        "ja-JP",
			AudioChannelCount:                   2,
			EnableSeparateRecognitionPerChannel: true,
			EnableWordTimeOffsets:               true,
			EnableAutomaticPunctuation:          true,
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %w", err)
	}

	// Print the results: one "Channel" header per alternative, followed by
	// one line per recognized word.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "Channel %v\n", result.ChannelTag)
			for _, wd := range alt.Words {
				// Only the whole-second part of each offset is printed; the
				// getters avoid a nil dereference if an offset is absent.
				fmt.Fprintf(w, " %d ~ %d %s\n",
					wd.GetStartTime().GetSeconds(),
					wd.GetEndTime().GetSeconds(),
					wd.Word)
			}
		}
	}
	return nil
}
ebiken.icon
ストリーミングでリアルタイム書き起こしとかもできそうなので、どれくらいの精度やレイテンシがあるのか気になる