Bug 1945643 - Update to mozilla-nimbus-schemas 2025.1.1 r=chumphreys
[gecko.git] / dom / media / webspeech / recognition / endpointer.h
blob7879d6b9f3b770b07a100b037bf37e0cf239602a
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
30 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
32 #include "energy_endpointer.h"
34 namespace mozilla {
36 struct AudioChunk;
38 // A simple interface to the underlying energy-endpointer implementation. This
39 // class lets callers provide audio as it is being recorded and lets them poll
40 // to find out when the user has stopped speaking.
42 // There are two events that may trigger the end of speech:
44 // speechInputPossiblyComplete event:
46 // Signals that silence/noise has been detected for a *short* amount of
47 // time after some speech has been detected. It can be used for low latency
48 // UI feedback. To disable it, set it to a large amount.
50 // speechInputComplete event:
52 // This event is intended to signal end of input and to stop recording.
53 // The amount of time to wait after speech is set by
54 // speech_input_complete_silence_length_ and optionally two other
55 // parameters (see below).
56 // This time can be held constant, or can change as more speech is detected.
57 // In the latter case, the time changes after a set amount of time from the
58 // *beginning* of speech. This is motivated by the expectation that there
59 // will be two distinct types of inputs: short search queries and longer
60 // dictation style input.
62 // Three parameters are used to define the piecewise constant timeout function.
63 // The timeout length is speech_input_complete_silence_length until
64 // long_speech_length, when it changes to
65 // long_speech_input_complete_silence_length.
66 class Endpointer {
67 public:
68 explicit Endpointer(int sample_rate);
70 // Start the endpointer. This should be called at the beginning of a session.
71 void StartSession();
73 // Stop the endpointer.
74 void EndSession();
76 // Start environment estimation. Audio will be used for environment estimation
77 // i.e. noise level estimation.
78 void SetEnvironmentEstimationMode();
80 // Start user input. This should be called when the user indicates start of
81 // input, e.g. by pressing a button.
82 void SetUserInputMode();
84 // Process a segment of audio, which may be more than one frame.
85 // The status of the last frame will be returned.
86 EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
88 // Get the status of the endpointer.
89 EpStatus Status(int64_t *time_us);
91 // Get the expected frame size for audio chunks. Audio chunks are expected
92 // to contain a number of samples that is a multiple of this number, and extra
93 // samples will be dropped.
94 int32_t FrameSize() const {
95 return frame_size_;
98 // Returns true if the endpointer detected reasonable audio levels above
99 // background noise which could be user speech, false if not.
100 bool DidStartReceivingSpeech() const {
101 return speech_previously_detected_;
104 bool IsEstimatingEnvironment() const {
105 return energy_endpointer_.estimating_environment();
108 void set_speech_input_complete_silence_length(int64_t time_us) {
109 speech_input_complete_silence_length_us_ = time_us;
112 void set_long_speech_input_complete_silence_length(int64_t time_us) {
113 long_speech_input_complete_silence_length_us_ = time_us;
116 void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
117 speech_input_possibly_complete_silence_length_us_ = time_us;
120 void set_long_speech_length(int64_t time_us) {
121 long_speech_length_us_ = time_us;
124 bool speech_input_complete() const {
125 return speech_input_complete_;
128 // RMS background noise level in dB.
129 float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
131 private:
132 // Reset internal states. Helper method common to initial input utterance
133 // and following input utternaces.
134 void Reset();
136 // Minimum allowable length of speech input.
137 int64_t speech_input_minimum_length_us_;
139 // The speechInputPossiblyComplete event signals that silence/noise has been
140 // detected for a *short* amount of time after some speech has been detected.
141 // This proporty specifies the time period.
142 int64_t speech_input_possibly_complete_silence_length_us_;
144 // The speechInputComplete event signals that silence/noise has been
145 // detected for a *long* amount of time after some speech has been detected.
146 // This property specifies the time period.
147 int64_t speech_input_complete_silence_length_us_;
149 // Same as above, this specifies the required silence period after speech
150 // detection. This period is used instead of
151 // speech_input_complete_silence_length_ when the utterance is longer than
152 // long_speech_length_. This parameter is optional.
153 int64_t long_speech_input_complete_silence_length_us_;
155 // The period of time after which the endpointer should consider
156 // long_speech_input_complete_silence_length_ as a valid silence period
157 // instead of speech_input_complete_silence_length_. This parameter is
158 // optional.
159 int64_t long_speech_length_us_;
161 // First speech onset time, used in determination of speech complete timeout.
162 int64_t speech_start_time_us_;
164 // Most recent end time, used in determination of speech complete timeout.
165 int64_t speech_end_time_us_;
167 int64_t audio_frame_time_us_;
168 EpStatus old_ep_status_;
169 bool waiting_for_speech_possibly_complete_timeout_;
170 bool waiting_for_speech_complete_timeout_;
171 bool speech_previously_detected_;
172 bool speech_input_complete_;
173 EnergyEndpointer energy_endpointer_;
174 int sample_rate_;
175 int32_t frame_size_;
178 } // namespace mozilla
180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_