// Copyright 2021 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:typed_data';
import 'package:googleapis/speech/v1.dart' as gcloud;
import 'package:logging/logging.dart';
final _log = Logger('audio_matchers');
/// Maximum request size for Cloud Speech RPCs is 10 MB.
///
/// We reserve 1/4 of the quota for base64 encoding, since base64 expands the
/// raw audio bytes by a factor of 4/3.
///
/// Sending a payload bigger than this results in a 400 error.
/// See [audio limits](https://cloud.google.com/speech-to-text/quotas#content).
const _speechPayloadLimitBytes = 10 * 1024 * 1024 * 3 / 4;
/// Allow 1024 bytes of overhead for the non-audio parts of the request.
const _bufferSizeBytes = 1024;
/// Safeguard: keep the audio payload we send to Cloud Speech [_bufferSizeBytes]
/// under the limit.
const _maxSTTRequestSizeBytes = _speechPayloadLimitBytes - _bufferSizeBytes;
/// googleapis/speech/v1.dart [gcloud.RecognitionConfig] is missing
/// multichannel support, so this subclass adds the missing fields to the
/// serialized JSON.
class RecognitionConfig extends gcloud.RecognitionConfig {
int audioChannelCount;
bool enableSeparateRecognitionPerChannel;
@override
Map<String, dynamic> toJson() {
final json = super.toJson();
if (audioChannelCount != null) {
json['audioChannelCount'] = audioChannelCount;
}
if (enableSeparateRecognitionPerChannel != null) {
json['enableSeparateRecognitionPerChannel'] =
enableSeparateRecognitionPerChannel;
}
return json;
}
}
/// Invokes the speech to text API on a byte array [data] containing an audio
/// file.
///
/// Returns a list of transcripts, potentially (but not necessarily) for
/// each audio channel.
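///
/// Returns null if the service reports no recognition results.
///
/// A minimal usage sketch, assuming an already-authenticated `http.Client`;
/// the client and file names below are illustrative only:
///
/// ```dart
/// final api = gcloud.SpeechApi(authenticatedHttpClient);
/// final wavBytes = await File('utterance.wav').readAsBytes();
/// final transcripts = await speechToText(api, wavBytes, 'en-US');
/// ```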
Future<Iterable<String>> speechToText(
gcloud.SpeechApi api, List<int> data, String languageCode) async {
// Field selector constructed using
// https://developers.google.com/apis-explorer fields editor.
const $fields = 'results/alternatives/transcript';
List<int> audioData = data;
if (!data.every((element) => (element >= 0 && element <= 255))) {
    _log.warning('Found invalid audio data; every value must be an 8-bit byte.');
// Force data to be 8-bit.
audioData = Uint8List.fromList(data);
}
if (audioData.length > _maxSTTRequestSizeBytes) {
_log.warning('Truncating speechToText audio to ${_maxSTTRequestSizeBytes}B.'
' Data was ${data.length}B');
    audioData = audioData.sublist(0, _maxSTTRequestSizeBytes.toInt());
}
final request = gcloud.RecognizeRequest()
..config = (RecognitionConfig()
..languageCode = languageCode
..audioChannelCount = 2
// Setting this to false only transcribes the first channel.
..enableSeparateRecognitionPerChannel = false)
..audio = (gcloud.RecognitionAudio()..contentAsBytes = audioData);
_log.fine('Calling api.speech.recognize for ${$fields}');
final results =
(await api.speech.recognize(request, $fields: $fields)).results;
if (results == null) {
return null;
}
// We don't always get more than one result, but when we do, hopefully it's
// because we did per-channel recognition.
final transcripts = results.map((r) => r.alternatives.single.transcript);
_log.info('Transcripts: ${transcripts.map((t) => '"$t"').join(', ')}');
return transcripts;
}
/// Logs a warning if any of [transcriptions] does not match [pat], or if
/// [transcriptions] is null or empty.
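///
/// A minimal usage sketch together with [speechToText]; the expected phrase
/// is illustrative only:
///
/// ```dart
/// final transcripts = await speechToText(api, wavBytes, 'en-US');
/// warnOnTranscriptionNotMatching(transcripts, RegExp(r'hello\s+world'));
/// ```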
void warnOnTranscriptionNotMatching(
Iterable<String> transcriptions, Pattern pat) {
if (transcriptions == null || transcriptions.isEmpty) {
_log.warning('No recognized speech response');
return;
}
final notMatched =
transcriptions.firstWhere((t) => !t.contains(pat), orElse: () => null);
if (notMatched != null) {
_log.warning('Audio transcription "$notMatched" did not match "$pat"');
}
}