1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/browser/speech/endpointer/endpointer.h"
8 #include "content/browser/speech/audio_buffer.h"
// Number of audio frames per second. The Endpointer constructor divides the
// sample rate by this value to obtain the samples-per-frame count, and uses
// its reciprocal (0.02) as the frame period and frame duration handed to the
// energy endpointer configuration.
13 const int kFrameRate
= 50; // 1 frame = 20ms of audio.
// Builds an endpointer for audio delivered at |sample_rate|.
//
// The constructor:
//   * stores |sample_rate| and zeroes the running audio timestamp,
//   * derives the number of samples per frame from kFrameRate,
//   * installs the default timing thresholds (all in microseconds), and
//   * fills an EnergyEndpointerParams with the Push To Talk defaults and
//     passes it to energy_endpointer_.Init().
//
// NOTE(review): |sample_rate| is presumably in samples per second (Hz) --
// confirm against callers.
18 Endpointer::Endpointer(int sample_rate
)
// The two silence lengths start at -1 here but are overwritten with their
// real defaults further down in this constructor.
19 : speech_input_possibly_complete_silence_length_us_(-1),
20 speech_input_complete_silence_length_us_(-1),
21 audio_frame_time_us_(0),
22 sample_rate_(sample_rate
),
// Samples per frame: sample_rate / kFrameRate, with the fractional part
// discarded by the cast to int.
26 frame_size_
= static_cast<int>(sample_rate
/ static_cast<float>(kFrameRate
));
// Default thresholds, expressed in microseconds:
//   minimum input length before timeouts are evaluated : 1.7 s
//   silence needed to declare the input complete        : 0.5 s
//   silence needed to declare it possibly complete      : 1 s
// The two long-speech values are set to -1; ProcessAudio() only applies the
// stepped silence length when both of them are > 0, so it is off by default.
28 speech_input_minimum_length_us_
=
29 static_cast<int64
>(1.7 * Time::kMicrosecondsPerSecond
);
30 speech_input_complete_silence_length_us_
=
31 static_cast<int64
>(0.5 * Time::kMicrosecondsPerSecond
);
32 long_speech_input_complete_silence_length_us_
= -1;
33 long_speech_length_us_
= -1;
34 speech_input_possibly_complete_silence_length_us_
=
35 1 * Time::kMicrosecondsPerSecond
;
37 // Set the default configuration for Push To Talk mode.
38 EnergyEndpointerParams ep_config
;
// Frame period and frame duration are both 1 / kFrameRate = 0.02, i.e. the
// 20 ms frame expressed in seconds.
39 ep_config
.set_frame_period(1.0f
/ static_cast<float>(kFrameRate
));
40 ep_config
.set_frame_duration(1.0f
/ static_cast<float>(kFrameRate
));
// NOTE(review): the window/duration values below are presumably in seconds
// too, and the threshold units are not visible here -- confirm against
// EnergyEndpointerParams.
41 ep_config
.set_endpoint_margin(0.2f
);
42 ep_config
.set_onset_window(0.15f
);
43 ep_config
.set_speech_on_window(0.4f
);
44 ep_config
.set_offset_window(0.15f
);
45 ep_config
.set_onset_detect_dur(0.09f
);
46 ep_config
.set_onset_confirm_dur(0.075f
);
47 ep_config
.set_on_maintain_dur(0.10f
);
48 ep_config
.set_offset_confirm_dur(0.12f
);
49 ep_config
.set_decision_threshold(1000.0f
);
50 ep_config
.set_min_decision_threshold(50.0f
);
51 ep_config
.set_fast_update_dur(0.2f
);
// The energy endpointer receives the same sample rate, converted to float.
52 ep_config
.set_sample_rate(static_cast<float>(sample_rate
));
53 ep_config
.set_min_fundamental_frequency(57.143f
);
54 ep_config
.set_max_fundamental_frequency(400.0f
);
55 ep_config
.set_contamination_rejection_period(0.25f
);
// Hand the finished configuration to the underlying energy endpointer.
56 energy_endpointer_
.Init(ep_config
);
// Puts the speech-tracking state back to its initial values: the previous
// status becomes EP_PRE_SPEECH, both timeout flags and both detection flags
// are cleared, the running audio timestamp returns to 0, and the recorded
// speech start/end times are set to -1.
// NOTE(review): -1 appears to act as the "not set" marker for the start/end
// times -- confirm against the header.
59 void Endpointer::Reset() {
60 old_ep_status_
= EP_PRE_SPEECH
;
61 waiting_for_speech_possibly_complete_timeout_
= false;
62 waiting_for_speech_complete_timeout_
= false;
63 speech_previously_detected_
= false;
64 speech_input_complete_
= false;
65 audio_frame_time_us_
= 0; // Reset time for packets sent to endpointer.
66 speech_end_time_us_
= -1;
67 speech_start_time_us_
= -1;
// Starts a session on the underlying energy endpointer by forwarding to
// energy_endpointer_.StartSession().
70 void Endpointer::StartSession() {
72 energy_endpointer_
.StartSession();
// Ends the current session by forwarding to energy_endpointer_.EndSession().
75 void Endpointer::EndSession() {
76 energy_endpointer_
.EndSession();
// Switches the underlying energy endpointer into environment estimation mode
// by forwarding to energy_endpointer_.SetEnvironmentEstimationMode().
79 void Endpointer::SetEnvironmentEstimationMode() {
81 energy_endpointer_
.SetEnvironmentEstimationMode();
// Switches the underlying energy endpointer into user input mode by
// forwarding to energy_endpointer_.SetUserInputMode().
84 void Endpointer::SetUserInputMode() {
85 energy_endpointer_
.SetUserInputMode();
// Returns the current status reported by the underlying energy endpointer.
// |time| is passed straight through to energy_endpointer_.Status().
// NOTE(review): |time| is presumably an out-parameter receiving the
// endpointer's timestamp in microseconds -- confirm against EnergyEndpointer.
88 EpStatus
Endpointer::Status(int64
*time
) {
89 return energy_endpointer_
.Status(time
);
// Feeds the 16-bit samples of |raw_audio| to the energy endpointer one
// frame_size_-sample frame at a time. After each frame it advances
// audio_frame_time_us_, queries the endpointer status, and updates the
// speech start/end times, the two silence-timeout flags and
// speech_input_complete_ according to the status transition observed.
//
// |ep_status| starts as EP_PRE_SPEECH and is overwritten after every
// processed frame.
// NOTE(review): presumably the final |ep_status| is what gets returned, and
// |rms_out| is forwarded to ProcessAudioFrame() -- confirm.
// NOTE(review): if frame_size_ is 0 (a sample rate below kFrameRate) the
// loop index never advances and the loop condition stays true -- confirm
// callers always supply a sample rate of at least kFrameRate.
92 EpStatus
Endpointer::ProcessAudio(const AudioChunk
& raw_audio
, float* rms_out
) {
93 const int16
* audio_data
= raw_audio
.SamplesData16();
94 const int num_samples
= raw_audio
.NumSamples();
95 EpStatus ep_status
= EP_PRE_SPEECH
;
97 // Process the input data in blocks of frame_size_, dropping any incomplete
98 // frames at the end (which is ok since typically the caller will be recording
99 // audio in multiples of our frame size).
100 int sample_index
= 0;
101 while (sample_index
+ frame_size_
<= num_samples
) {
102 // Have the endpointer process the frame.
103 energy_endpointer_
.ProcessAudioFrame(audio_frame_time_us_
,
104 audio_data
+ sample_index
,
107 sample_index
+= frame_size_
;
108 audio_frame_time_us_
+= (frame_size_
* Time::kMicrosecondsPerSecond
) /
111 // Get the status of the endpointer.
113 ep_status
= energy_endpointer_
.Status(&ep_time
);
115 // Handle state changes.
// Transition EP_POSSIBLE_ONSET -> EP_SPEECH_PRESENT: speech is confirmed, so
// any previously recorded end time and pending silence timeouts are dropped.
116 if ((EP_SPEECH_PRESENT
== ep_status
) &&
117 (EP_POSSIBLE_ONSET
== old_ep_status_
)) {
118 speech_end_time_us_
= -1;
119 waiting_for_speech_possibly_complete_timeout_
= false;
120 waiting_for_speech_complete_timeout_
= false;
121 // Trigger SpeechInputDidStart event on first detection.
// The start time is recorded only the first time speech is detected.
122 if (false == speech_previously_detected_
) {
123 speech_previously_detected_
= true;
124 speech_start_time_us_
= ep_time
;
// Transition EP_POSSIBLE_OFFSET -> EP_PRE_SPEECH: speech has stopped, so the
// end time is recorded and both silence timeouts are armed.
127 if ((EP_PRE_SPEECH
== ep_status
) &&
128 (EP_POSSIBLE_OFFSET
== old_ep_status_
)) {
129 speech_end_time_us_
= ep_time
;
130 waiting_for_speech_possibly_complete_timeout_
= true;
131 waiting_for_speech_complete_timeout_
= true;
// Timeouts are evaluated only once the endpointer time has passed the
// configured minimum input length.
133 if (ep_time
> speech_input_minimum_length_us_
) {
134 // Speech possibly complete timeout.
135 if ((waiting_for_speech_possibly_complete_timeout_
) &&
136 (ep_time
- speech_end_time_us_
>
137 speech_input_possibly_complete_silence_length_us_
)) {
138 waiting_for_speech_possibly_complete_timeout_
= false;
140 if (waiting_for_speech_complete_timeout_
) {
141 // The length of the silence timeout period can be held constant, or it
142 // can be changed after a fixed amount of time from the beginning of
// The stepped (long-speech) silence length applies only when both long-speech
// settings are positive and the elapsed time since the recorded speech start
// exceeds long_speech_length_us_.
144 bool has_stepped_silence
=
145 (long_speech_length_us_
> 0) &&
146 (long_speech_input_complete_silence_length_us_
> 0);
147 int64 requested_silence_length
;
148 if (has_stepped_silence
&&
149 (ep_time
- speech_start_time_us_
) > long_speech_length_us_
) {
150 requested_silence_length
=
151 long_speech_input_complete_silence_length_us_
;
153 requested_silence_length
=
154 speech_input_complete_silence_length_us_
;
157 // Speech complete timeout.
// Enough silence has elapsed since the recorded end of speech: disarm the
// timeout and mark the input as complete.
158 if ((ep_time
- speech_end_time_us_
) > requested_silence_length
) {
159 waiting_for_speech_complete_timeout_
= false;
160 speech_input_complete_
= true;
// Remember this frame's status so the next frame can detect a transition.
164 old_ep_status_
= ep_status
;
169 } // namespace content