1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/renderer/media/speech_recognition_audio_sink.h"
8 #include "base/strings/utf_string_conversions.h"
9 #include "content/renderer/media/media_stream_audio_source.h"
10 #include "content/renderer/media/mock_media_constraint_factory.h"
11 #include "content/renderer/media/webrtc/webrtc_local_audio_track_adapter.h"
12 #include "content/renderer/media/webrtc_local_audio_track.h"
13 #include "media/audio/audio_parameters.h"
14 #include "media/base/audio_bus.h"
15 #include "testing/gmock/include/gmock/gmock.h"
16 #include "testing/gtest/include/gtest/gtest.h"
17 #include "third_party/WebKit/public/platform/WebMediaStreamTrack.h"
21 // Supported speech recognition audio parameters.
22 const int kSpeechRecognitionSampleRate
= 16000;
23 const int kSpeechRecognitionFramesPerBuffer
= 1600;
25 // Input audio format.
26 const media::AudioParameters::Format kInputFormat
=
27 media::AudioParameters::AUDIO_PCM_LOW_LATENCY
;
28 const media::ChannelLayout kInputChannelLayout
= media::CHANNEL_LAYOUT_MONO
;
29 const int kInputChannels
= 1;
30 const int kInputBitsPerSample
= 16;
32 // Output audio format.
33 const media::AudioParameters::Format kOutputFormat
=
34 media::AudioParameters::AUDIO_PCM_LOW_LATENCY
;
35 const media::ChannelLayout kOutputChannelLayout
= media::CHANNEL_LAYOUT_STEREO
;
36 const int kOutputChannels
= 2;
37 const int kOutputBitsPerSample
= 16;
39 // Mocked out sockets used for Send/Receive.
40 // Data is written and read from a shared buffer used as a FIFO and there is
41 // no blocking. |OnSendCB| is used to trigger a |Receive| on the other socket.
42 class MockSyncSocket
: public base::SyncSocket
{
44 // This allows for 2 requests in queue between the |MockSyncSocket|s.
// NOTE(review): "2 requests" presumably assumes 4-byte messages (8 / 4) —
// confirm against the size of the value actually sent over the socket.
45 static const int kSharedBufferSize
= 8;
47 // Buffer to be shared between two |MockSyncSocket|s. Allocated on heap.
// The constructor value-initializes |data| and sets |start| and |length| to 0,
// so the FIFO starts out empty.
49 SharedBuffer() : data(), start(0), length(0) {}
// Fixed-size byte storage backing the FIFO.
51 uint8 data
[kSharedBufferSize
];
56 // Callback used for pairing an A.Send() with B.Receive() without blocking.
57 typedef base::Callback
<void()> OnSendCB
;
// Creates a socket on |shared_buffer|; |on_send_cb_| is left
// default-constructed. Failure mode starts disabled.
59 explicit MockSyncSocket(SharedBuffer
* shared_buffer
)
60 : buffer_(shared_buffer
),
61 in_failure_mode_(false) {}
// Creates a socket on |shared_buffer| that also stores a copy of |on_send_cb|.
// Failure mode starts disabled.
63 MockSyncSocket(SharedBuffer
* shared_buffer
, const OnSendCB
& on_send_cb
)
64 : buffer_(shared_buffer
),
65 on_send_cb_(on_send_cb
),
66 in_failure_mode_(false) {}
// base::SyncSocket overrides; both operate on the shared FIFO |buffer_|.
68 size_t Send(const void* buffer
, size_t length
) override
;
69 size_t Receive(void* buffer
, size_t length
) override
;
71 // When |in_failure_mode_| == true, the socket fails to send.
72 void SetFailureMode(bool in_failure_mode
) {
73 in_failure_mode_
= in_failure_mode
;
// Raw pointer to the FIFO shared with the peer socket; this class only stores
// the pointer it was constructed with.
77 SharedBuffer
* buffer_
;
// Callback supplied at construction (unset for the one-argument constructor).
78 const OnSendCB on_send_cb_
;
// Toggled through SetFailureMode().
79 bool in_failure_mode_
;
81 DISALLOW_COPY_AND_ASSIGN(MockSyncSocket
);
84 // base::SyncSocket implementation
// Send() copies |length| bytes from |buffer| into the shared FIFO.
// NOTE(review): this listing jumps from line 85 to 89 and stops at line 91, so
// the failure-mode check, any |on_send_cb_| invocation and the return
// statement are presumably in the lines not shown — confirm in the full file.
85 size_t MockSyncSocket::Send(const void* buffer
, size_t length
) {
89 const uint8
* b
= static_cast<const uint8
*>(buffer
);
// Append byte by byte behind whatever is already queued: the write position is
// |start| + |length|, and |length| grows by one per byte stored.
// NOTE(review): nothing visible here bounds the write position by
// kSharedBufferSize; callers must not queue more than the FIFO holds.
90 for (size_t i
= 0; i
< length
; ++i
, ++buffer_
->length
)
91 buffer_
->data
[buffer_
->start
+ buffer_
->length
] = b
[i
];
97 size_t MockSyncSocket::Receive(void* buffer
, size_t length
) {
// Receive() drains the shared FIFO into |buffer| and then empties the FIFO.
// NOTE(review): the return statement is not visible in this listing.
98 uint8
* b
= static_cast<uint8
*>(buffer
);
// |i| and |buffer_->start| begin equal and advance together, so each byte is
// copied from data[start] to b[start]; the loop runs until the index reaches
// |buffer_->length|.
// NOTE(review): the |length| argument is not consulted by the visible code, so
// the number of bytes written to |buffer| is governed by the FIFO contents
// alone. The destination index is only zero-based because |start| is 0 on
// entry (it starts at 0 and is reset to 0 below after every call).
99 for (size_t i
= buffer_
->start
; i
< buffer_
->length
; ++i
, ++buffer_
->start
)
100 b
[i
] = buffer_
->data
[buffer_
->start
];
102 // Since buffer is used sequentially, we can reset the buffer indices here.
103 buffer_
->start
= buffer_
->length
= 0;
107 // This fake class is the consumer used to verify behaviour of the producer.
108 // The |Initialize()| method shows what the consumer should be responsible for
109 // in the production code (minus the mocks).
110 class FakeSpeechRecognizer
{
112 FakeSpeechRecognizer() : is_responsive_(true) {}
115 const blink::WebMediaStreamTrack
& track
,
116 const media::AudioParameters
& sink_params
,
117 base::SharedMemoryHandle
* foreign_memory_handle
) {
118 // Shared memory is allocated, mapped and shared.
119 const uint32 kSharedMemorySize
=
120 sizeof(media::AudioInputBufferParameters
) +
121 media::AudioBus::CalculateMemorySize(sink_params
);
122 shared_memory_
.reset(new base::SharedMemory());
123 ASSERT_TRUE(shared_memory_
->CreateAndMapAnonymous(kSharedMemorySize
));
124 memset(shared_memory_
->memory(), 0, kSharedMemorySize
);
125 ASSERT_TRUE(shared_memory_
->ShareToProcess(base::GetCurrentProcessHandle(),
126 foreign_memory_handle
));
128 // Wrap the shared memory for the audio bus.
129 media::AudioInputBuffer
* buffer
=
130 static_cast<media::AudioInputBuffer
*>(shared_memory_
->memory());
132 audio_track_bus_
= media::AudioBus::WrapMemory(sink_params
, buffer
->audio
);
133 audio_track_bus_
->Zero();
135 // Reset the counter used to synchronize.
136 buffer
->params
.size
= 0U;
138 // Create a shared buffer for the |MockSyncSocket|s.
139 shared_buffer_
.reset(new MockSyncSocket::SharedBuffer());
141 // Local socket will receive signals from the producer.
142 receiving_socket_
.reset(new MockSyncSocket(shared_buffer_
.get()));
144 // We automatically trigger a Receive when data is sent over the socket.
145 sending_socket_
= new MockSyncSocket(
146 shared_buffer_
.get(),
147 base::Bind(&FakeSpeechRecognizer::EmulateReceiveThreadLoopIteration
,
148 base::Unretained(this)));
150 // This is usually done to pair the sockets. Here it's not effective.
151 base::SyncSocket::CreatePair(receiving_socket_
.get(), sending_socket_
);
154 // Emulates a single iteration of a thread receiving on the socket.
155 // This would normally be done on a receiving thread's task on the browser.
156 void EmulateReceiveThreadLoopIteration() {
160 const int kSize
= sizeof(media::AudioInputBufferParameters().size
);
161 receiving_socket_
->Receive(&(GetAudioInputBuffer()->params
.size
), kSize
);
163 // Notify the producer that the audio buffer has been consumed.
164 GetAudioInputBuffer()->params
.size
++;
167 // Used to simulate an unresponsive behaviour of the consumer.
168 void SimulateResponsiveness(bool is_responsive
) {
169 is_responsive_
= is_responsive
;
172 media::AudioInputBuffer
* GetAudioInputBuffer() const {
173 return static_cast<media::AudioInputBuffer
*>(shared_memory_
->memory());
176 MockSyncSocket
* sending_socket() { return sending_socket_
; }
177 media::AudioBus
* audio_bus() const { return audio_track_bus_
.get(); }
183 // Shared memory for the audio and synchronization.
184 scoped_ptr
<base::SharedMemory
> shared_memory_
;
186 // Fake sockets and their shared buffer.
187 scoped_ptr
<MockSyncSocket::SharedBuffer
> shared_buffer_
;
188 scoped_ptr
<MockSyncSocket
> receiving_socket_
;
189 MockSyncSocket
* sending_socket_
;
191 // Audio bus wrapping the shared memory from the renderer.
192 scoped_ptr
<media::AudioBus
> audio_track_bus_
;
194 DISALLOW_COPY_AND_ASSIGN(FakeSpeechRecognizer
);
201 class SpeechRecognitionAudioSinkTest
: public testing::Test
{
203 SpeechRecognitionAudioSinkTest() {}
205 ~SpeechRecognitionAudioSinkTest() {}
207 // Initializes the producer and consumer with specified audio parameters.
208 // Returns the minimal number of input audio buffers which need to be captured
209 // before they get sent to the consumer.
210 uint32
Initialize(int input_sample_rate
,
211 int input_frames_per_buffer
,
212 int output_sample_rate
,
213 int output_frames_per_buffer
) {
214 // Audio Environment setup.
215 source_params_
.Reset(kInputFormat
,
219 input_frames_per_buffer
);
220 sink_params_
.Reset(kOutputFormat
,
221 kOutputChannelLayout
,
223 kOutputBitsPerSample
,
224 output_frames_per_buffer
);
226 media::AudioBus::Create(kInputChannels
, input_frames_per_buffer
);
228 first_frame_capture_time_
= base::TimeTicks::Now();
229 sample_frames_captured_
= 0;
231 // Prepare the track and audio source.
232 blink::WebMediaStreamTrack blink_track
;
233 PrepareBlinkTrackOfType(MEDIA_DEVICE_AUDIO_CAPTURE
, &blink_track
);
235 // Get the native track from the blink track and initialize.
237 static_cast<WebRtcLocalAudioTrack
*>(blink_track
.extraData());
238 native_track_
->OnSetFormat(source_params_
);
240 // Create and initialize the consumer.
241 recognizer_
.reset(new FakeSpeechRecognizer());
242 base::SharedMemoryHandle foreign_memory_handle
;
243 recognizer_
->Initialize(blink_track
, sink_params_
, &foreign_memory_handle
);
245 // Create the producer.
246 scoped_ptr
<base::SyncSocket
> sending_socket(recognizer_
->sending_socket());
247 speech_audio_sink_
.reset(new SpeechRecognitionAudioSink(
248 blink_track
, sink_params_
, foreign_memory_handle
,
249 sending_socket
.Pass(),
250 base::Bind(&SpeechRecognitionAudioSinkTest::StoppedCallback
,
251 base::Unretained(this))));
253 // Return number of buffers needed to trigger resampling and consumption.
254 return static_cast<uint32
>(std::ceil(
255 static_cast<double>(output_frames_per_buffer
* input_sample_rate
) /
256 (input_frames_per_buffer
* output_sample_rate
)));
259 // Mock callback expected to be called when the track is stopped.
260 MOCK_METHOD0(StoppedCallback
, void());
263 // Prepares a blink track of a given MediaStreamType and attaches the native
264 // track which can be used to capture audio data and pass it to the producer.
265 static void PrepareBlinkTrackOfType(
266 const MediaStreamType device_type
,
267 blink::WebMediaStreamTrack
* blink_track
) {
268 StreamDeviceInfo
device_info(device_type
, "Mock device",
270 MockMediaConstraintFactory constraint_factory
;
271 const blink::WebMediaConstraints constraints
=
272 constraint_factory
.CreateWebMediaConstraints();
273 scoped_refptr
<WebRtcAudioCapturer
> capturer(
274 WebRtcAudioCapturer::CreateCapturer(-1, device_info
, constraints
, NULL
,
276 scoped_refptr
<WebRtcLocalAudioTrackAdapter
> adapter(
277 WebRtcLocalAudioTrackAdapter::Create(std::string(), NULL
));
278 scoped_ptr
<WebRtcLocalAudioTrack
> native_track(
279 new WebRtcLocalAudioTrack(adapter
.get(), capturer
, NULL
));
280 blink::WebMediaStreamSource blink_audio_source
;
281 blink_audio_source
.initialize(base::UTF8ToUTF16("dummy_source_id"),
282 blink::WebMediaStreamSource::TypeAudio
,
283 base::UTF8ToUTF16("dummy_source_name"),
284 false /* remote */, true /* readonly */);
285 MediaStreamSource::SourceStoppedCallback cb
;
286 blink_audio_source
.setExtraData(
287 new MediaStreamAudioSource(-1, device_info
, cb
, NULL
));
288 blink_track
->initialize(blink::WebString::fromUTF8("dummy_track"),
290 blink_track
->setExtraData(native_track
.release());
293 // Emulates an audio capture device capturing data from the source.
294 inline void CaptureAudio(const uint32 buffers
) {
295 for (uint32 i
= 0; i
< buffers
; ++i
) {
296 const base::TimeTicks estimated_capture_time
= first_frame_capture_time_
+
297 (sample_frames_captured_
* base::TimeDelta::FromSeconds(1) /
298 source_params_
.sample_rate());
299 native_track()->Capture(*source_bus_
, estimated_capture_time
, false);
300 sample_frames_captured_
+= source_bus_
->frames();
304 // Used to simulate a problem with sockets.
305 void SetFailureModeOnForeignSocket(bool in_failure_mode
) {
306 recognizer()->sending_socket()->SetFailureMode(in_failure_mode
);
309 // Helper method for verifying captured audio data has been consumed.
310 inline void AssertConsumedBuffers(const uint32 buffer_index
) {
311 ASSERT_EQ(buffer_index
, recognizer()->GetAudioInputBuffer()->params
.size
);
314 // Helper method for providing audio data to producer and verifying it was
315 // consumed on the recognizer.
316 inline void CaptureAudioAndAssertConsumedBuffers(const uint32 buffers
,
317 const uint32 buffer_index
) {
318 CaptureAudio(buffers
);
319 AssertConsumedBuffers(buffer_index
);
322 // Helper method to capture and assert consumption at different sample rates
323 // and audio buffer sizes.
324 inline void AssertConsumptionForAudioParameters(
325 const int input_sample_rate
,
326 const int input_frames_per_buffer
,
327 const int output_sample_rate
,
328 const int output_frames_per_buffer
,
329 const uint32 consumptions
) {
330 const uint32 buffers_per_notification
=
331 Initialize(input_sample_rate
,
332 input_frames_per_buffer
,
334 output_frames_per_buffer
);
335 AssertConsumedBuffers(0U);
337 for (uint32 i
= 1U; i
<= consumptions
; ++i
) {
338 CaptureAudio(buffers_per_notification
);
339 ASSERT_EQ(i
, recognizer()->GetAudioInputBuffer()->params
.size
)
340 << "Tested at rates: "
341 << "In(" << input_sample_rate
<< ", " << input_frames_per_buffer
343 << "Out(" << output_sample_rate
<< ", " << output_frames_per_buffer
348 media::AudioBus
* source_bus() const { return source_bus_
.get(); }
350 FakeSpeechRecognizer
* recognizer() const { return recognizer_
.get(); }
352 const media::AudioParameters
& sink_params() const { return sink_params_
; }
354 WebRtcLocalAudioTrack
* native_track() const { return native_track_
; }
358 scoped_ptr
<SpeechRecognitionAudioSink
> speech_audio_sink_
;
361 scoped_ptr
<FakeSpeechRecognizer
> recognizer_
;
363 // Audio related members.
364 scoped_ptr
<media::AudioBus
> source_bus_
;
365 media::AudioParameters source_params_
;
366 media::AudioParameters sink_params_
;
367 WebRtcLocalAudioTrack
* native_track_
;
369 base::TimeTicks first_frame_capture_time_
;
370 int64 sample_frames_captured_
;
372 DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionAudioSinkTest
);
375 // Not all types of tracks are supported. This test checks if that policy is
376 // implemented correctly.
377 TEST_F(SpeechRecognitionAudioSinkTest
, CheckIsSupportedAudioTrack
) {
// Maps each MediaStreamType to whether a track of that type is expected to be
// reported as supported.
378 typedef std::map
<MediaStreamType
, bool> SupportedTrackPolicy
;
380 // This test must be aligned with the policy of supported tracks.
381 SupportedTrackPolicy p
;
382 p
[MEDIA_NO_SERVICE
] = false;
383 p
[MEDIA_DEVICE_AUDIO_CAPTURE
] = true; // The only one supported for now.
384 p
[MEDIA_DEVICE_VIDEO_CAPTURE
] = false;
385 p
[MEDIA_TAB_AUDIO_CAPTURE
] = false;
386 p
[MEDIA_TAB_VIDEO_CAPTURE
] = false;
387 p
[MEDIA_DESKTOP_VIDEO_CAPTURE
] = false;
388 p
[MEDIA_DESKTOP_AUDIO_CAPTURE
] = false;
389 p
[MEDIA_DEVICE_AUDIO_OUTPUT
] = false;
391 // Ensure this test gets updated along with |content::MediaStreamType| enum.
// The map must hold exactly one entry per media type; adding an enum value
// without extending the table above makes this expectation fail.
392 EXPECT_EQ(NUM_MEDIA_TYPES
, p
.size());
394 // Check the entire policy.
// For every entry, build a blink track of that type and query
// SpeechRecognitionAudioSink::IsSupportedTrack() on it.
// NOTE(review): the assertion macro that opens the expression closed on line
// 400 is not visible in this listing (lines 398-399 are missing).
395 for (SupportedTrackPolicy::iterator it
= p
.begin(); it
!= p
.end(); ++it
) {
396 blink::WebMediaStreamTrack blink_track
;
397 PrepareBlinkTrackOfType(it
->first
, &blink_track
);
400 SpeechRecognitionAudioSink::IsSupportedTrack(blink_track
));
404 // Checks if the producer can support the listed range of input sample rates
405 // and associated buffer sizes.
406 TEST_F(SpeechRecognitionAudioSinkTest
, RecognizerNotifiedOnSocket
) {
407 const size_t kNumAudioParamTuples
= 24;
// Each tuple is {input sample rate, input frames per buffer}. Every rate is
// listed twice: once with a buffer of roughly 10 ms and once with roughly
// 100 ms of audio.
408 const int kAudioParams
[kNumAudioParamTuples
][2] = {
409 {8000, 80}, {8000, 800}, {16000, 160}, {16000, 1600},
410 {24000, 240}, {24000, 2400}, {32000, 320}, {32000, 3200},
411 {44100, 441}, {44100, 4410}, {48000, 480}, {48000, 4800},
412 {96000, 960}, {96000, 9600}, {11025, 111}, {11025, 1103},
413 {22050, 221}, {22050, 2205}, {88200, 882}, {88200, 8820},
414 {176400, 1764}, {176400, 17640}, {192000, 1920}, {192000, 19200}};
416 // Check all listed tuples of input sample rates and buffer sizes.
// The output side is always the speech recognition format declared at the top
// of the file; the final argument (3U) is the number of consumptions verified
// for each tuple.
417 for (size_t i
= 0; i
< kNumAudioParamTuples
; ++i
) {
418 AssertConsumptionForAudioParameters(
419 kAudioParams
[i
][0], kAudioParams
[i
][1],
420 kSpeechRecognitionSampleRate
, kSpeechRecognitionFramesPerBuffer
, 3U);
424 // Checks that the input data is getting resampled to the target sample rate.
425 TEST_F(SpeechRecognitionAudioSinkTest
, AudioDataIsResampledOnSink
) {
426 EXPECT_GE(kInputChannels
, 1);
427 EXPECT_GE(kOutputChannels
, 1);
429 // Input audio is sampled at 44.1 kHz with data chunks of 10 ms. Desired output
430 // corresponds to the speech recognition engine requirements: 16 kHz with
431 // 100 ms chunks (1600 frames per buffer).
432 const uint32 kSourceFrames
= 441;
433 const uint32 buffers_per_notification
=
434 Initialize(44100, kSourceFrames
, 16000, 1600);
435 // Fill audio input frames with 0, 1, 2, 3, ..., 440.
436 int16 source_data
[kSourceFrames
* kInputChannels
];
437 for (uint32 i
= 0; i
< kSourceFrames
; ++i
) {
438 for (int c
= 0; c
< kInputChannels
; ++c
)
439 source_data
[i
* kInputChannels
+ c
] = i
;
441 source_bus()->FromInterleaved(
442 source_data
, kSourceFrames
, sizeof(source_data
[0]));
444 // Prepare sink audio bus and data for rendering.
445 media::AudioBus
* sink_bus
= recognizer()->audio_bus();
446 const uint32 kSinkDataLength
= 1600 * kOutputChannels
;
447 int16 sink_data
[kSinkDataLength
] = {0};
449 // Render the audio data from the recognizer.
450 sink_bus
->ToInterleaved(sink_bus
->frames(),
451 sink_params().bits_per_sample() / 8, sink_data
);
453 // Checking only a fraction of the sink frames.
454 const uint32 kNumFramesToTest
= 12;
456 // Check all channels are zeroed out before we trigger resampling.
457 for (uint32 i
= 0; i
< kNumFramesToTest
; ++i
) {
458 for (int c
= 0; c
< kOutputChannels
; ++c
)
459 EXPECT_EQ(0, sink_data
[i
* kOutputChannels
+ c
]);
462 // Trigger the speech sink to resample the input data.
463 AssertConsumedBuffers(0U);
464 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
466 // Render the audio data from the recognizer.
467 sink_bus
->ToInterleaved(sink_bus
->frames(),
468 sink_params().bits_per_sample() / 8, sink_data
);
470 // Resampled data expected frames. Extracted based on |source_data|.
471 const int16 kExpectedData
[kNumFramesToTest
] = {0, 2, 5, 8, 11, 13,
472 16, 19, 22, 24, 27, 30};
474 // Check all channels have the same resampled data.
475 for (uint32 i
= 0; i
< kNumFramesToTest
; ++i
) {
476 for (int c
= 0; c
< kOutputChannels
; ++c
)
477 EXPECT_EQ(kExpectedData
[i
], sink_data
[i
* kOutputChannels
+ c
]);
481 // Checks that the producer does not misbehave when a socket failure occurs.
482 TEST_F(SpeechRecognitionAudioSinkTest
, SyncSocketFailsSendingData
) {
// Arguments are input sample rate, input frames per buffer, output sample rate
// and output frames per buffer; the result is the number of input buffers to
// capture per expected notification.
483 const uint32 buffers_per_notification
= Initialize(44100, 441, 16000, 1600);
484 // Start with no problems on the socket.
485 AssertConsumedBuffers(0U);
486 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
488 // A failure occurs (socket cannot send).
// The consumed-buffer counter is expected to stay at 1: capturing another
// round of audio while the socket is failing must not register as consumed.
489 SetFailureModeOnForeignSocket(true);
490 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
493 // A very unlikely scenario in which the peer is not synchronizing for a long
494 // time (e.g. 300 ms) which results in dropping cached buffers and restarting.
495 // We check that the FIFO overflow does not occur and that the producer is able
// to carry on once the consumer is responsive again.
497 TEST_F(SpeechRecognitionAudioSinkTest
, RepeatedSycnhronizationLag
) {
498 const uint32 buffers_per_notification
= Initialize(44100, 441, 16000, 1600);
500 // Start with no synchronization problems.
501 AssertConsumedBuffers(0U);
502 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
504 // Consumer gets out of sync.
// Three further rounds of captured audio are expected to leave the counter
// untouched at 1 while the recognizer is marked unresponsive.
505 recognizer()->SimulateResponsiveness(false);
506 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
507 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
508 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
510 // Consumer recovers.
// From here on every round is expected to be consumed again, so the counter
// advances by exactly one per round (2, 3, 4) without catching up on the
// rounds captured during the lag.
511 recognizer()->SimulateResponsiveness(true);
512 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 2U);
513 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 3U);
514 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 4U);
517 // Checks that an OnStoppedCallback is issued when the track is stopped.
518 TEST_F(SpeechRecognitionAudioSinkTest
, OnReadyStateChangedOccured
) {
519 const uint32 buffers_per_notification
= Initialize(44100, 441, 16000, 1600);
520 AssertConsumedBuffers(0U);
521 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
// The expectation is registered before Stop() so that the mocked
// StoppedCallback() invocation is counted; exactly one call is expected.
522 EXPECT_CALL(*this, StoppedCallback()).Times(1);
524 native_track()->Stop();
// Audio captured after the track has been stopped is expected not to be
// consumed: the counter stays at 1.
525 CaptureAudioAndAssertConsumedBuffers(buffers_per_notification
, 1U);
528 } // namespace content