1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 // The EnergyEndpointer class finds likely speech onset and offset points.
31 // The implementation described here is about the simplest possible.
32 // It is based on timings of threshold crossings for overall signal
33 // RMS. It is suitable for light weight applications.
35 // As written, the basic idea is that one specifies intervals that
36 // must be occupied by super- and sub-threshold energy levels, and
37 // defers decisions re onset and offset times until these
38 // specifications have been met. Three basic intervals are tested: an
39 // onset window, a speech-on window, and an offset window. We require
40 // super-threshold to exceed some minimum total durations in the onset
41 // and speech-on windows before declaring the speech onset time, and
42 // we specify a required sub-threshold residency in the offset window
43 // before declaring speech offset. As the various residency requirements are
44 // met, the EnergyEndpointer instance assumes various states, and can return the
45 // ID of these states to the client (see EpStatus below).
47 // The levels of the speech and background noise are continuously updated. It is
48 // important that the background noise level be estimated initially for
49 // robustness in noisy conditions. The first frames are assumed to be background
50 // noise and a fast update rate is used for the noise level. The duration for
51 // fast update is controlled by the fast_update_dur_ parameter.
53 // If used in noisy conditions, the endpointer should be started and run in the
54 // EnvironmentEstimation mode, for at least 200ms, before switching to
55 // user input mode (see SetUserInputMode()).
56 // Audio feedback contamination can appear in the input audio, if not cut
57 // out or handled by echo cancellation. Audio feedback can trigger a false
58 // accept. The false accepts can be ignored by setting
59 // ep_contamination_rejection_period.
61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
66 #include "mozilla/UniquePtr.h"
68 #include "energy_endpointer_params.h"
72 // Endpointer status codes
// Declaration of EnergyEndpointer, the class that (per the file header) finds
// likely speech onset and offset points from threshold crossings of the
// overall signal RMS.
//
// NOTE(review): this listing looks like extraction residue. Every line starts
// with a stray numeric token, several declarations are split across lines,
// and some comments below have no declaration following them. No access
// specifiers and no closing "};" for the class are visible in this span.
// Confirm against the canonical header before relying on this text.
81 class EnergyEndpointer
{
83 // The default construction MUST be followed by Init(), before any
84 // other use can be made of the instance.
// NOTE(review): no default-constructor declaration is visible after the
// comment above; only the destructor follows.
86 virtual ~EnergyEndpointer();
// Initializes the instance from `params`. According to the construction note
// above, this must be called before any other member is used.
88 void Init(const EnergyEndpointerParams
& params
);
// NOTE(review): the two comments below ("Start the endpointer" and "Stop the
// endpointer") are not followed by any declaration in this listing.
90 // Start the endpointer. This should be called at the beginning of a session.
93 // Stop the endpointer.
96 // Start environment estimation. Audio will be used for environment estimation
97 // i.e. noise level estimation.
98 void SetEnvironmentEstimationMode();
100 // Start user input. This should be called when the user indicates start of
101 // input, e.g. by pressing a button.
102 void SetUserInputMode();
104 // Computes the next input frame and modifies EnergyEndpointer status as
105 // appropriate based on the computation.
// Visible parameters: `time_us` (presumably a timestamp in microseconds, going
// by the name; TODO confirm), `samples` (pointer to const 16-bit samples) and
// `num_samples` (presumably the number of entries in `samples`; TODO confirm).
// NOTE(review): the parameter list ends on a trailing comma with no closing
// ");", so at least one more parameter appears to be missing here.
106 void ProcessAudioFrame(int64_t time_us
,
107 const int16_t* samples
, int num_samples
,
110 // Returns the current state of the EnergyEndpointer and the time
111 // corresponding to the most recently computed frame.
// NOTE(review): presumably the time is written through `status_time_us`;
// whether a null pointer is accepted cannot be told from this declaration.
112 EpStatus
Status(int64_t* status_time_us
) const;
// Inline accessor returning the estimating_environment_ flag.
// NOTE(review): the closing "}" of this inline body is not visible here.
114 bool estimating_environment() const {
115 return estimating_environment_
;
118 // Returns estimated noise level in dB.
119 float GetNoiseLevelDb() const;
124 // Resets the endpointer internal state. If reset_threshold is true, the
125 // state will be reset completely, including adaptive thresholds and the
126 // removal of all history information.
127 void Restart(bool reset_threshold
);
129 // Update internal speech and noise levels.
// `rms` is presumably the RMS of the current frame; TODO confirm with callers.
130 void UpdateLevels(float rms
);
132 // Returns the number of frames (or frame number) corresponding to
133 // the 'time' (in seconds).
134 int TimeToFrame(float time
) const;
// ---- Instance state ----
136 EpStatus status_
; // The current state of this instance.
137 float offset_confirm_dur_sec_
; // max on time allowed to confirm POST_SPEECH
138 int64_t endpointer_time_us_
; // Time of the most recently received audio frame.
139 int64_t fast_update_frames_
; // Number of frames for initial level adaptation.
140 int64_t frame_counter_
; // Number of frames seen. Used for initial adaptation.
141 float max_window_dur_
; // Largest search window size (seconds)
142 float sample_rate_
; // Sampling rate.
144 // Ring buffers to hold the speech activity history.
// NOTE(review): HistoryRing is not declared anywhere in the visible text.
145 UniquePtr
<HistoryRing
> history_
;
147 // Configuration parameters.
148 EnergyEndpointerParams params_
;
150 // RMS which must be exceeded to conclude frame is speech.
151 float decision_threshold_
;
153 // Flag to indicate that audio should be used to estimate environment, prior
154 // to receiving user input.
// This is the value returned by estimating_environment().
155 bool estimating_environment_
;
// NOTE(review): the four comments below describe members whose declarations
// are not visible in this listing.
157 // Estimate of the background noise level. Used externally for UI feedback.
160 // An adaptive threshold used to update decision_threshold_ when appropriate.
163 // Start lag corresponds to the highest fundamental frequency.
166 // End lag corresponds to the lowest fundamental frequency.
169 // Time when mode switched from environment estimation to user input. This
170 // is used to time forced rejection of audio feedback contamination.
171 int64_t user_input_start_time_us_
;
173 // prevent copy constructor and assignment
// The copy constructor and copy assignment are declared without bodies here.
// NOTE(review): their access level is not visible in this span; presumably
// they are private and left undefined so that copying fails to build.
174 EnergyEndpointer(const EnergyEndpointer
&);
175 void operator=(const EnergyEndpointer
&);
178 } // namespace mozilla
180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_