1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
6 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
8 #include "base/basictypes.h"
9 #include "content/browser/speech/endpointer/energy_endpointer.h"
10 #include "content/common/content_export.h"
18 // A simple interface to the underlying energy-endpointer implementation. This
19 // class lets callers provide audio as it is being recorded and lets them poll
20 // to find out when the user has stopped speaking.
22 // There are two events that may trigger the end of speech:
24 // speechInputPossiblyComplete event:
26 // Signals that silence/noise has been detected for a *short* amount of
27 // time after some speech has been detected. It can be used for low latency
28 // UI feedback. To disable it, set its silence length to a very large value.
30 // speechInputComplete event:
32 // This event is intended to signal end of input and to stop recording.
33 // The amount of time to wait after speech is set by
34 // speech_input_complete_silence_length_us_ and optionally two other
35 // parameters (see below).
36 // This time can be held constant, or can change as more speech is detected.
37 // In the latter case, the time changes after a set amount of time from the
38 // *beginning* of speech. This is motivated by the expectation that there
39 // will be two distinct types of inputs: short search queries and longer
40 // dictation style input.
42 // Three parameters are used to define the piecewise constant timeout function.
43 // The timeout length is speech_input_complete_silence_length until
44 // long_speech_length, when it changes to
45 // long_speech_input_complete_silence_length.
46 class CONTENT_EXPORT Endpointer
{
// Creates an endpointer. |sample_rate| is presumably the sampling rate of the
// audio that will be passed to ProcessAudio() (units not shown here; confirm
// against the implementation).
48 explicit Endpointer(int sample_rate
);
50 // Start the endpointer. This should be called at the beginning of a session.
53 // Stop the endpointer.
56 // Start environment estimation. Audio will be used for environment estimation
57 // i.e. noise level estimation.
58 void SetEnvironmentEstimationMode();
60 // Start user input. This should be called when the user indicates start of
61 // input, e.g. by pressing a button.
62 void SetUserInputMode();
64 // Process a segment of audio, which may be more than one frame.
65 // The status of the last frame will be returned.
// |rms_out| is an output parameter; presumably it receives the RMS level of
// the processed audio (confirm against the implementation).
66 EpStatus
ProcessAudio(const AudioChunk
& raw_audio
, float* rms_out
);
68 // Get the status of the endpointer.
// |time_us| is an output parameter; presumably it receives a timestamp in
// microseconds associated with the returned status (confirm against the
// implementation).
69 EpStatus
Status(int64
*time_us
);
71 // Returns true if the endpointer detected reasonable audio levels above
72 // background noise which could be user speech, false if not.
73 bool DidStartReceivingSpeech() const {
74 return speech_previously_detected_
;
// Returns whether the underlying energy endpointer reports that it is
// currently estimating the environment.
77 bool IsEstimatingEnvironment() const {
78 return energy_endpointer_
.estimating_environment();
// Sets the silence length after speech that signals the speechInputComplete
// event. |time_us| is presumably in microseconds, per the _us suffix.
81 void set_speech_input_complete_silence_length(int64 time_us
) {
82 speech_input_complete_silence_length_us_
= time_us
;
// Sets the silence length used for the speechInputComplete event once the
// utterance is longer than the long speech length.
85 void set_long_speech_input_complete_silence_length(int64 time_us
) {
86 long_speech_input_complete_silence_length_us_
= time_us
;
// Sets the (short) silence length after speech that signals the
// speechInputPossiblyComplete event.
89 void set_speech_input_possibly_complete_silence_length(int64 time_us
) {
90 speech_input_possibly_complete_silence_length_us_
= time_us
;
// Sets the utterance length after which the long-speech silence length is
// used instead of the regular one.
93 void set_long_speech_length(int64 time_us
) {
94 long_speech_length_us_
= time_us
;
// Returns the current value of the speech_input_complete_ flag.
97 bool speech_input_complete() const {
98 return speech_input_complete_
;
101 // RMS background noise level in dB.
102 float NoiseLevelDb() const { return energy_endpointer_
.GetNoiseLevelDb(); }
105 // Reset internal states. Helper method common to initial input utterance
106 // and following input utterances.
109 // Minimum allowable length of speech input.
110 int64 speech_input_minimum_length_us_
;
112 // The speechInputPossiblyComplete event signals that silence/noise has been
113 // detected for a *short* amount of time after some speech has been detected.
114 // This property specifies the time period.
115 int64 speech_input_possibly_complete_silence_length_us_
;
117 // The speechInputComplete event signals that silence/noise has been
118 // detected for a *long* amount of time after some speech has been detected.
119 // This property specifies the time period.
120 int64 speech_input_complete_silence_length_us_
;
122 // Same as above, this specifies the required silence period after speech
123 // detection. This period is used instead of
124 // speech_input_complete_silence_length_us_ when the utterance is longer than
125 // long_speech_length_us_. This parameter is optional.
126 int64 long_speech_input_complete_silence_length_us_
;
128 // The period of time after which the endpointer should consider
129 // long_speech_input_complete_silence_length_us_ as a valid silence period
130 // instead of speech_input_complete_silence_length_us_. This parameter is
// optional. (NOTE(review): the end of this sentence is missing here; "optional"
// is inferred from the parallel comment on
// long_speech_input_complete_silence_length_us_ above; please confirm.)
132 int64 long_speech_length_us_
;
134 // First speech onset time, used in determination of speech complete timeout.
135 int64 speech_start_time_us_
;
137 // Most recent end time, used in determination of speech complete timeout.
138 int64 speech_end_time_us_
;
// Bookkeeping state below is not read or written by any inline method in this
// header.
140 int64 audio_frame_time_us_
;
141 EpStatus old_ep_status_
;
142 bool waiting_for_speech_possibly_complete_timeout_
;
143 bool waiting_for_speech_complete_timeout_
;
// Value returned by DidStartReceivingSpeech().
144 bool speech_previously_detected_
;
// Value returned by speech_input_complete().
145 bool speech_input_complete_
;
// Underlying energy-based endpointer; backs IsEstimatingEnvironment() and
// NoiseLevelDb().
146 EnergyEndpointer energy_endpointer_
;
151 } // namespace content
153 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_