// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:typed_data';
import 'package:googleapis/speech/v1.dart' as gcloud;
import 'package:logging/logging.dart';
// Library-private logger, registered under the name 'audio_matchers', used by
// the speech-to-text helpers in this file.
final _log = Logger('audio_matchers');
/// Maximum number of raw audio bytes that fit in one Cloud Speech RPC.
///
/// The maximum request size for Cloud Speech RPCs is 10 MB. We reserve 1/4 of
/// that quota for base64 encoding, which leaves 3/4 of it for the raw audio.
/// Sending a payload bigger than this results in a 400 error. See the Cloud
/// Speech-to-Text audio limits documentation.
///
/// Computed with `~/` so the limit stays an `int`; `/` would make it a
/// `double`.
const _speechPayloadLimitBytes = 10 * 1024 * 1024 * 3 ~/ 4;

/// Allow 1024 bytes of overhead in the audio data payload.
const _bufferSizeBytes = 1024;

/// Largest audio payload we send to Cloud Speech: [_speechPayloadLimitBytes]
/// reduced by the [_bufferSizeBytes] safety margin.
const _maxSTTRequestSizeBytes = _speechPayloadLimitBytes - _bufferSizeBytes;
/// A [gcloud.RecognitionConfig] extended with multichannel fields.
///
/// googleapis/speech/v1.dart RecognitionConfig is missing multichannel
/// support, so this subclass declares the two fields and adds them to the
/// serialized request itself.
class RecognitionConfig extends gcloud.RecognitionConfig {
  /// Serialized as `audioChannelCount`; left out of the JSON when null.
  int audioChannelCount;

  /// Serialized as `enableSeparateRecognitionPerChannel`; left out of the
  /// JSON when null.
  bool enableSeparateRecognitionPerChannel;

  /// Returns the base class JSON plus whichever multichannel fields are set.
  @override
  Map<String, dynamic> toJson() {
    final json = super.toJson();
    if (audioChannelCount != null) {
      json['audioChannelCount'] = audioChannelCount;
    }
    if (enableSeparateRecognitionPerChannel != null) {
      json['enableSeparateRecognitionPerChannel'] =
          enableSeparateRecognitionPerChannel;
    }
    return json;
  }
}
/// Invokes the speech to text API on a byte array [data] containing an audio
/// file.
///
/// Values in [data] outside the range 0..255 are forced to 8 bits, and audio
/// longer than [_maxSTTRequestSizeBytes] is truncated before it is sent. The
/// request uses [languageCode], declares two audio channels and leaves
/// per-channel recognition disabled.
///
/// Returns a list of transcripts, potentially (but not necessarily) for
/// each audio channel, or null when the response contains no results.
Future<Iterable<String>> speechToText(
    gcloud.SpeechApi api, List<int> data, String languageCode) async {
  // Field selector restricting the response to the transcript text;
  // constructed using the fields editor.
  const $fields = 'results/alternatives/transcript';

  List<int> audioData = data;
  if (!data.every((element) => (element >= 0 && element <= 255))) {
    _log.warning('Found invalid audio data, data needs to be 8-bit value.');
    // Force data to be 8-bit.
    audioData = Uint8List.fromList(data);
  }

  // Whole number of bytes, usable both as a sublist bound and in the log
  // message without a fractional part.
  final maxBytes = _maxSTTRequestSizeBytes.toInt();
  if (audioData.length > maxBytes) {
    _log.warning('Truncating speechToText audio to ${maxBytes}B.'
        ' Data was ${data.length}B');
    // Truncate the (possibly coerced) audioData, not the raw input, so the
    // 8-bit fix-up above is not thrown away.
    audioData = audioData.sublist(0, maxBytes);
  }

  final request = gcloud.RecognizeRequest()
    ..config = (RecognitionConfig()
      ..languageCode = languageCode
      ..audioChannelCount = 2
      // Setting this to false only transcribes the first channel.
      ..enableSeparateRecognitionPerChannel = false)
    ..audio = (gcloud.RecognitionAudio()..contentAsBytes = audioData);

  _log.fine('Calling api.speech.recognize for ${$fields}');
  final results =
      (await api.speech.recognize(request, $fields: $fields)).results;
  if (results == null) {
    return null;
  }

  // We don't always get more than one result, but when we do, hopefully it's
  // because we did per-channel recognition.
  final transcripts = results.map((r) => r.alternatives.single.transcript);
  // NOTE(review): the log level of this message was reconstructed; confirm
  // that info is the intended level.
  _log.info('Transcripts: ${transcripts.map((t) => '"$t"').join(', ')}');
  return transcripts;
}
/// Prints a warning if the transcription doesn't match [pat] or is empty.
///
/// Logs 'No recognized speech response' when [transcriptions] is null or
/// empty. Otherwise logs a warning naming the first transcription that does
/// not contain [pat]; nothing is logged when every transcription matches.
void warnOnTranscriptionNotMatching(
    Iterable<String> transcriptions, Pattern pat) {
  if (transcriptions == null || transcriptions.isEmpty) {
    _log.warning('No recognized speech response');
    // Stop here: there is nothing to compare against [pat], and iterating a
    // null [transcriptions] would throw.
    return;
  }

  // Only the first mismatch is reported.
  for (final transcription in transcriptions) {
    if (!transcription.contains(pat)) {
      _log.warning(
          'Audio transcription "$transcription" did not match "$pat"');
      return;
    }
  }
}