1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // The EnergyEndpointer class finds likely speech onset and offset points.
7 // The implementation described here is about the simplest possible.
8 // It is based on timings of threshold crossings for overall signal
9 // RMS. It is suitable for light weight applications.
11 // As written, the basic idea is that one specifies intervals that
12 // must be occupied by super- and sub-threshold energy levels, and
13 // defers decisions re onset and offset times until these
14 // specifications have been met. Three basic intervals are tested: an
15 // onset window, a speech-on window, and an offset window. We require
// super-threshold to exceed some minimum total durations in the onset
17 // and speech-on windows before declaring the speech onset time, and
18 // we specify a required sub-threshold residency in the offset window
19 // before declaring speech offset. As the various residency requirements are
20 // met, the EnergyEndpointer instance assumes various states, and can return the
21 // ID of these states to the client (see EpStatus below).
23 // The levels of the speech and background noise are continuously updated. It is
24 // important that the background noise level be estimated initially for
25 // robustness in noisy conditions. The first frames are assumed to be background
26 // noise and a fast update rate is used for the noise level. The duration for
// fast update is controlled by the fast_update_dur_ parameter.
// If used in noisy conditions, the endpointer should be started and run in the
// EnvironmentEstimation mode, for at least 200ms, before switching to
// user input mode (see SetUserInputMode()).
32 // Audio feedback contamination can appear in the input audio, if not cut
33 // out or handled by echo cancellation. Audio feedback can trigger a false
34 // accept. The false accepts can be ignored by setting
35 // ep_contamination_rejection_period.
37 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
38 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
42 #include "base/basictypes.h"
43 #include "base/memory/scoped_ptr.h"
44 #include "content/browser/speech/endpointer/energy_endpointer_params.h"
45 #include "content/common/content_export.h"
49 // Endpointer status codes
58 class CONTENT_EXPORT EnergyEndpointer
{
60 // The default construction MUST be followed by Init(), before any
61 // other use can be made of the instance.
63 virtual ~EnergyEndpointer();
65 void Init(const EnergyEndpointerParams
& params
);
67 // Start the endpointer. This should be called at the beginning of a session.
70 // Stop the endpointer.
73 // Start environment estimation. Audio will be used for environment estimation
74 // i.e. noise level estimation.
75 void SetEnvironmentEstimationMode();
77 // Start user input. This should be called when the user indicates start of
78 // input, e.g. by pressing a button.
79 void SetUserInputMode();
81 // Computes the next input frame and modifies EnergyEndpointer status as
82 // appropriate based on the computation.
83 void ProcessAudioFrame(int64 time_us
,
84 const int16
* samples
, int num_samples
,
87 // Returns the current state of the EnergyEndpointer and the time
88 // corresponding to the most recently computed frame.
89 EpStatus
Status(int64
* status_time_us
) const;
91 bool estimating_environment() const {
92 return estimating_environment_
;
95 // Returns estimated noise level in dB.
96 float GetNoiseLevelDb() const;
101 // Resets the endpointer internal state. If reset_threshold is true, the
102 // state will be reset completely, including adaptive thresholds and the
103 // removal of all history information.
104 void Restart(bool reset_threshold
);
106 // Update internal speech and noise levels.
107 void UpdateLevels(float rms
);
109 // Returns the number of frames (or frame number) corresponding to
110 // the 'time' (in seconds).
111 int TimeToFrame(float time
) const;
113 EpStatus status_
; // The current state of this instance.
114 float offset_confirm_dur_sec_
; // max on time allowed to confirm POST_SPEECH
115 int64 endpointer_time_us_
; // Time of the most recently received audio frame.
116 int64 fast_update_frames_
; // Number of frames for initial level adaptation.
117 int64 frame_counter_
; // Number of frames seen. Used for initial adaptation.
118 float max_window_dur_
; // Largest search window size (seconds)
119 float sample_rate_
; // Sampling rate.
121 // Ring buffers to hold the speech activity history.
122 scoped_ptr
<HistoryRing
> history_
;
124 // Configuration parameters.
125 EnergyEndpointerParams params_
;
127 // RMS which must be exceeded to conclude frame is speech.
128 float decision_threshold_
;
130 // Flag to indicate that audio should be used to estimate environment, prior
131 // to receiving user input.
132 bool estimating_environment_
;
134 // Estimate of the background noise level. Used externally for UI feedback.
137 // An adaptive threshold used to update decision_threshold_ when appropriate.
140 // Start lag corresponds to the highest fundamental frequency.
143 // End lag corresponds to the lowest fundamental frequency.
146 // Time when mode switched from environment estimation to user input. This
147 // is used to time forced rejection of audio feedback contamination.
148 int64 user_input_start_time_us_
;
150 DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer
);
153 } // namespace content
155 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_