1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
// To know more about the algorithm used and the original code which this is
// based on, see
// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
9 #include "content/browser/speech/endpointer/energy_endpointer.h"
13 #include "base/logging.h"
17 // Returns the RMS (quadratic mean) of the input signal.
18 float RMS(const int16
* samples
, int num_samples
) {
21 for (int i
= 0; i
< num_samples
; ++i
) {
22 sum_int64
+= samples
[i
];
23 ssq_int64
+= samples
[i
] * samples
[i
];
25 // now convert to floats.
26 double sum
= static_cast<double>(sum_int64
);
28 double ssq
= static_cast<double>(ssq_int64
);
29 return static_cast<float>(sqrt((ssq
/ num_samples
) - (sum
* sum
)));
32 int64
Secs2Usecs(float seconds
) {
33 return static_cast<int64
>(0.5 + (1.0e6
* seconds
));
// Converts a linear amplitude |value| to decibels (20 * log10(value)).
// Zero and negative amplitudes have no finite logarithm, so they map to a
// fixed, very low floor of -2000 dB instead of -inf / NaN.
float GetDecibel(float value) {
  if (value > 0.0f)
    return 20 * log10(value);
  return -2000.0f;
}
46 // Stores threshold-crossing histories for making decisions about the speech
48 class EnergyEndpointer::HistoryRing
{
50 HistoryRing() : insertion_index_(0) {}
52 // Resets the ring to |size| elements each with state |initial_state|
53 void SetRing(int size
, bool initial_state
);
55 // Inserts a new entry into the ring and drops the oldest entry.
56 void Insert(int64 time_us
, bool decision
);
58 // Returns the time in microseconds of the most recently added entry.
59 int64
EndTime() const;
61 // Returns the sum of all intervals during which 'decision' is true within
62 // the time in seconds specified by 'duration'. The returned interval is
64 float RingSum(float duration_sec
);
67 struct DecisionPoint
{
72 std::vector
<DecisionPoint
> decision_points_
;
73 int insertion_index_
; // Index at which the next item gets added/inserted.
75 DISALLOW_COPY_AND_ASSIGN(HistoryRing
);
78 void EnergyEndpointer::HistoryRing::SetRing(int size
, bool initial_state
) {
80 decision_points_
.clear();
81 DecisionPoint init
= { -1, initial_state
};
82 decision_points_
.resize(size
, init
);
85 void EnergyEndpointer::HistoryRing::Insert(int64 time_us
, bool decision
) {
86 decision_points_
[insertion_index_
].time_us
= time_us
;
87 decision_points_
[insertion_index_
].decision
= decision
;
88 insertion_index_
= (insertion_index_
+ 1) % decision_points_
.size();
91 int64
EnergyEndpointer::HistoryRing::EndTime() const {
92 int ind
= insertion_index_
- 1;
94 ind
= decision_points_
.size() - 1;
95 return decision_points_
[ind
].time_us
;
98 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec
) {
99 if (!decision_points_
.size())
103 int ind
= insertion_index_
- 1;
105 ind
= decision_points_
.size() - 1;
106 int64 end_us
= decision_points_
[ind
].time_us
;
107 bool is_on
= decision_points_
[ind
].decision
;
108 int64 start_us
= end_us
- static_cast<int64
>(0.5 + (1.0e6
* duration_sec
));
111 size_t n_summed
= 1; // n points ==> (n-1) intervals
112 while ((decision_points_
[ind
].time_us
> start_us
) &&
113 (n_summed
< decision_points_
.size())) {
116 ind
= decision_points_
.size() - 1;
118 sum_us
+= end_us
- decision_points_
[ind
].time_us
;
119 is_on
= decision_points_
[ind
].decision
;
120 end_us
= decision_points_
[ind
].time_us
;
124 return 1.0e-6f
* sum_us
; // Returns total time that was super threshold.
127 EnergyEndpointer::EnergyEndpointer()
128 : status_(EP_PRE_SPEECH
),
129 offset_confirm_dur_sec_(0),
130 endpointer_time_us_(0),
131 fast_update_frames_(0),
133 max_window_dur_(4.0),
135 history_(new HistoryRing()),
136 decision_threshold_(0),
137 estimating_environment_(false),
142 user_input_start_time_us_(0) {
145 EnergyEndpointer::~EnergyEndpointer() {
148 int EnergyEndpointer::TimeToFrame(float time
) const {
149 return static_cast<int32
>(0.5 + (time
/ params_
.frame_period()));
152 void EnergyEndpointer::Restart(bool reset_threshold
) {
153 status_
= EP_PRE_SPEECH
;
154 user_input_start_time_us_
= 0;
156 if (reset_threshold
) {
157 decision_threshold_
= params_
.decision_threshold();
158 rms_adapt_
= decision_threshold_
;
159 noise_level_
= params_
.decision_threshold() / 2.0f
;
160 frame_counter_
= 0; // Used for rapid initial update of levels.
163 // Set up the memories to hold the history windows.
164 history_
->SetRing(TimeToFrame(max_window_dur_
), false);
166 // Flag that indicates that current input should be used for
167 // estimating the environment. The user has not yet started input
168 // by e.g. pressed the push-to-talk button. By default, this is
169 // false for backward compatibility.
170 estimating_environment_
= false;
173 void EnergyEndpointer::Init(const EnergyEndpointerParams
& params
) {
176 // Find the longest history interval to be used, and make the ring
177 // large enough to accommodate that number of frames. NOTE: This
178 // depends upon ep_frame_period being set correctly in the factory
179 // that did this instantiation.
180 max_window_dur_
= params_
.onset_window();
181 if (params_
.speech_on_window() > max_window_dur_
)
182 max_window_dur_
= params_
.speech_on_window();
183 if (params_
.offset_window() > max_window_dur_
)
184 max_window_dur_
= params_
.offset_window();
187 offset_confirm_dur_sec_
= params_
.offset_window() -
188 params_
.offset_confirm_dur();
189 if (offset_confirm_dur_sec_
< 0.0)
190 offset_confirm_dur_sec_
= 0.0;
192 user_input_start_time_us_
= 0;
194 // Flag that indicates that current input should be used for
195 // estimating the environment. The user has not yet started input
196 // by e.g. pressed the push-to-talk button. By default, this is
197 // false for backward compatibility.
198 estimating_environment_
= false;
199 // The initial value of the noise and speech levels is inconsequential.
200 // The level of the first frame will overwrite these values.
201 noise_level_
= params_
.decision_threshold() / 2.0f
;
202 fast_update_frames_
=
203 static_cast<int64
>(params_
.fast_update_dur() / params_
.frame_period());
205 frame_counter_
= 0; // Used for rapid initial update of levels.
207 sample_rate_
= params_
.sample_rate();
208 start_lag_
= static_cast<int>(sample_rate_
/
209 params_
.max_fundamental_frequency());
210 end_lag_
= static_cast<int>(sample_rate_
/
211 params_
.min_fundamental_frequency());
214 void EnergyEndpointer::StartSession() {
218 void EnergyEndpointer::EndSession() {
219 status_
= EP_POST_SPEECH
;
222 void EnergyEndpointer::SetEnvironmentEstimationMode() {
224 estimating_environment_
= true;
227 void EnergyEndpointer::SetUserInputMode() {
228 estimating_environment_
= false;
229 user_input_start_time_us_
= endpointer_time_us_
;
232 void EnergyEndpointer::ProcessAudioFrame(int64 time_us
,
233 const int16
* samples
,
236 endpointer_time_us_
= time_us
;
237 float rms
= RMS(samples
, num_samples
);
239 // Check that this is user input audio vs. pre-input adaptation audio.
240 // Input audio starts when the user indicates start of input, by e.g.
241 // pressing push-to-talk. Audio received prior to that is used to update
242 // noise and speech level estimates.
243 if (!estimating_environment_
) {
244 bool decision
= false;
245 if ((endpointer_time_us_
- user_input_start_time_us_
) <
246 Secs2Usecs(params_
.contamination_rejection_period())) {
248 DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_
;
250 decision
= (rms
> decision_threshold_
);
253 history_
->Insert(endpointer_time_us_
, decision
);
257 if (history_
->RingSum(params_
.onset_window()) >
258 params_
.onset_detect_dur()) {
259 status_
= EP_POSSIBLE_ONSET
;
263 case EP_POSSIBLE_ONSET
: {
264 float tsum
= history_
->RingSum(params_
.onset_window());
265 if (tsum
> params_
.onset_confirm_dur()) {
266 status_
= EP_SPEECH_PRESENT
;
267 } else { // If signal is not maintained, drop back to pre-speech.
268 if (tsum
<= params_
.onset_detect_dur())
269 status_
= EP_PRE_SPEECH
;
274 case EP_SPEECH_PRESENT
: {
275 // To induce hysteresis in the state residency, we allow a
276 // smaller residency time in the on_ring, than was required to
277 // enter the SPEECH_PERSENT state.
278 float on_time
= history_
->RingSum(params_
.speech_on_window());
279 if (on_time
< params_
.on_maintain_dur())
280 status_
= EP_POSSIBLE_OFFSET
;
284 case EP_POSSIBLE_OFFSET
:
285 if (history_
->RingSum(params_
.offset_window()) <=
286 offset_confirm_dur_sec_
) {
287 // Note that this offset time may be beyond the end
288 // of the input buffer in a real-time system. It will be up
289 // to the RecognizerSession to decide what to do.
290 status_
= EP_PRE_SPEECH
; // Automatically reset for next utterance.
291 } else { // If speech picks up again we allow return to SPEECH_PRESENT.
292 if (history_
->RingSum(params_
.speech_on_window()) >=
293 params_
.on_maintain_dur())
294 status_
= EP_SPEECH_PRESENT
;
299 LOG(WARNING
) << "Invalid case in switch: " << status_
;
303 // If this is a quiet, non-speech region, slowly adapt the detection
304 // threshold to be about 6dB above the average RMS.
305 if ((!decision
) && (status_
== EP_PRE_SPEECH
)) {
306 decision_threshold_
= (0.98f
* decision_threshold_
) + (0.02f
* 2 * rms
);
307 rms_adapt_
= decision_threshold_
;
309 // If this is in a speech region, adapt the decision threshold to
310 // be about 10dB below the average RMS. If the noise level is high,
311 // the threshold is pushed up.
312 // Adaptation up to a higher level is 5 times faster than decay to
314 if ((status_
== EP_SPEECH_PRESENT
) && decision
) {
315 if (rms_adapt_
> rms
) {
316 rms_adapt_
= (0.99f
* rms_adapt_
) + (0.01f
* rms
);
318 rms_adapt_
= (0.95f
* rms_adapt_
) + (0.05f
* rms
);
320 float target_threshold
= 0.3f
* rms_adapt_
+ noise_level_
;
321 decision_threshold_
= (.90f
* decision_threshold_
) +
322 (0.10f
* target_threshold
);
327 if (decision_threshold_
< params_
.min_decision_threshold())
328 decision_threshold_
= params_
.min_decision_threshold();
331 // Update speech and noise levels.
336 *rms_out
= GetDecibel(rms
);
339 float EnergyEndpointer::GetNoiseLevelDb() const {
340 return GetDecibel(noise_level_
);
343 void EnergyEndpointer::UpdateLevels(float rms
) {
344 // Update quickly initially. We assume this is noise and that
345 // speech is 6dB above the noise.
346 if (frame_counter_
< fast_update_frames_
) {
347 // Alpha increases from 0 to (k-1)/k where k is the number of time
348 // steps in the initial adaptation period.
349 float alpha
= static_cast<float>(frame_counter_
) /
350 static_cast<float>(fast_update_frames_
);
351 noise_level_
= (alpha
* noise_level_
) + ((1 - alpha
) * rms
);
352 DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
353 << ", fast_update_frames_ " << fast_update_frames_
;
355 // Update Noise level. The noise level adapts quickly downward, but
356 // slowly upward. The noise_level_ parameter is not currently used
357 // for threshold adaptation. It is used for UI feedback.
358 if (noise_level_
< rms
)
359 noise_level_
= (0.999f
* noise_level_
) + (0.001f
* rms
);
361 noise_level_
= (0.95f
* noise_level_
) + (0.05f
* rms
);
363 if (estimating_environment_
|| (frame_counter_
< fast_update_frames_
)) {
364 decision_threshold_
= noise_level_
* 2; // 6dB above noise level.
366 if (decision_threshold_
< params_
.min_decision_threshold())
367 decision_threshold_
= params_
.min_decision_threshold();
371 EpStatus
EnergyEndpointer::Status(int64
* status_time
) const {
372 *status_time
= history_
->EndTime();
376 } // namespace content