Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / content / browser / speech / endpointer / endpointer.h
blob6688ee63dc98567ffe3cd45454655dbee6d9ac1d
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
6 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
8 #include "base/basictypes.h"
9 #include "content/browser/speech/endpointer/energy_endpointer.h"
10 #include "content/common/content_export.h"
12 class EpStatus;
14 namespace content {
16 class AudioChunk;
18 // A simple interface to the underlying energy-endpointer implementation, this
19 // class lets callers provide audio as being recorded and let them poll to find
20 // when the user has stopped speaking.
22 // There are two events that may trigger the end of speech:
24 // speechInputPossiblyComplete event:
26 // Signals that silence/noise has been detected for a *short* amount of
27 // time after some speech has been detected. It can be used for low latency
28 // UI feedback. To disable it, set it to a large amount.
30 // speechInputComplete event:
32 // This event is intended to signal end of input and to stop recording.
33 // The amount of time to wait after speech is set by
34 // speech_input_complete_silence_length_ and optionally two other
35 // parameters (see below).
36 // This time can be held constant, or can change as more speech is detected.
37 // In the latter case, the time changes after a set amount of time from the
38 // *beginning* of speech. This is motivated by the expectation that there
39 // will be two distinct types of inputs: short search queries and longer
40 // dictation style input.
42 // Three parameters are used to define the piecewise constant timeout function.
43 // The timeout length is speech_input_complete_silence_length until
44 // long_speech_length, when it changes to
45 // long_speech_input_complete_silence_length.
46 class CONTENT_EXPORT Endpointer {
47 public:
48 explicit Endpointer(int sample_rate);
50 // Start the endpointer. This should be called at the beginning of a session.
51 void StartSession();
53 // Stop the endpointer.
54 void EndSession();
56 // Start environment estimation. Audio will be used for environment estimation
57 // i.e. noise level estimation.
58 void SetEnvironmentEstimationMode();
60 // Start user input. This should be called when the user indicates start of
61 // input, e.g. by pressing a button.
62 void SetUserInputMode();
64 // Process a segment of audio, which may be more than one frame.
65 // The status of the last frame will be returned.
66 EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
68 // Get the status of the endpointer.
69 EpStatus Status(int64 *time_us);
71 // Returns true if the endpointer detected reasonable audio levels above
72 // background noise which could be user speech, false if not.
73 bool DidStartReceivingSpeech() const {
74 return speech_previously_detected_;
77 bool IsEstimatingEnvironment() const {
78 return energy_endpointer_.estimating_environment();
81 void set_speech_input_complete_silence_length(int64 time_us) {
82 speech_input_complete_silence_length_us_ = time_us;
85 void set_long_speech_input_complete_silence_length(int64 time_us) {
86 long_speech_input_complete_silence_length_us_ = time_us;
89 void set_speech_input_possibly_complete_silence_length(int64 time_us) {
90 speech_input_possibly_complete_silence_length_us_ = time_us;
93 void set_long_speech_length(int64 time_us) {
94 long_speech_length_us_ = time_us;
97 bool speech_input_complete() const {
98 return speech_input_complete_;
101 // RMS background noise level in dB.
102 float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
104 private:
105 // Reset internal states. Helper method common to initial input utterance
106 // and following input utternaces.
107 void Reset();
109 // Minimum allowable length of speech input.
110 int64 speech_input_minimum_length_us_;
112 // The speechInputPossiblyComplete event signals that silence/noise has been
113 // detected for a *short* amount of time after some speech has been detected.
114 // This proporty specifies the time period.
115 int64 speech_input_possibly_complete_silence_length_us_;
117 // The speechInputComplete event signals that silence/noise has been
118 // detected for a *long* amount of time after some speech has been detected.
119 // This property specifies the time period.
120 int64 speech_input_complete_silence_length_us_;
122 // Same as above, this specifies the required silence period after speech
123 // detection. This period is used instead of
124 // speech_input_complete_silence_length_ when the utterance is longer than
125 // long_speech_length_. This parameter is optional.
126 int64 long_speech_input_complete_silence_length_us_;
128 // The period of time after which the endpointer should consider
129 // long_speech_input_complete_silence_length_ as a valid silence period
130 // instead of speech_input_complete_silence_length_. This parameter is
131 // optional.
132 int64 long_speech_length_us_;
134 // First speech onset time, used in determination of speech complete timeout.
135 int64 speech_start_time_us_;
137 // Most recent end time, used in determination of speech complete timeout.
138 int64 speech_end_time_us_;
140 int64 audio_frame_time_us_;
141 EpStatus old_ep_status_;
142 bool waiting_for_speech_possibly_complete_timeout_;
143 bool waiting_for_speech_complete_timeout_;
144 bool speech_previously_detected_;
145 bool speech_input_complete_;
146 EnergyEndpointer energy_endpointer_;
147 int sample_rate_;
148 int32 frame_size_;
151 } // namespace content
153 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_