2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
12 #include "threading.h"
16 #if CONFIG_MULTITHREAD
18 extern int vp8cx_encode_inter_macroblock(VP8_COMP
*cpi
, MACROBLOCK
*x
,
19 TOKENEXTRA
**t
, int recon_yoffset
,
21 extern int vp8cx_encode_intra_macro_block(VP8_COMP
*cpi
, MACROBLOCK
*x
,
23 extern void vp8cx_mb_init_quantizer(VP8_COMP
*cpi
, MACROBLOCK
*x
);
24 extern void vp8_build_block_offsets(MACROBLOCK
*x
);
25 extern void vp8_setup_block_ptrs(MACROBLOCK
*x
);
28 THREAD_FUNCTION
thread_encoding_proc(void *p_data
)
30 int ithread
= ((ENCODETHREAD_DATA
*)p_data
)->ithread
;
31 VP8_COMP
*cpi
= (VP8_COMP
*)(((ENCODETHREAD_DATA
*)p_data
)->ptr1
);
32 MB_ROW_COMP
*mbri
= (MB_ROW_COMP
*)(((ENCODETHREAD_DATA
*)p_data
)->ptr2
);
33 ENTROPY_CONTEXT_PLANES mb_row_left_context
;
35 const int nsync
= cpi
->mt_sync_range
;
36 //printf("Started thread %d\n", ithread);
40 if (cpi
->b_multi_threaded
== 0)
43 //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
44 if (sem_wait(&cpi
->h_event_start_encoding
[ithread
]) == 0)
46 VP8_COMMON
*cm
= &cpi
->common
;
48 MACROBLOCK
*x
= &mbri
->mb
;
49 MACROBLOCKD
*xd
= &x
->e_mbd
;
52 int *segment_counts
= mbri
->segment_counts
;
53 int *totalrate
= &mbri
->totalrate
;
55 if (cpi
->b_multi_threaded
== FALSE
) // we're shutting down
58 for (mb_row
= ithread
+ 1; mb_row
< cm
->mb_rows
; mb_row
+= (cpi
->encoding_thread_count
+ 1))
62 int recon_yoffset
, recon_uvoffset
;
64 int ref_fb_idx
= cm
->lst_fb_idx
;
65 int dst_fb_idx
= cm
->new_fb_idx
;
66 int recon_y_stride
= cm
->yv12_fb
[ref_fb_idx
].y_stride
;
67 int recon_uv_stride
= cm
->yv12_fb
[ref_fb_idx
].uv_stride
;
68 volatile int *last_row_current_mb_col
;
69 INT64 activity_sum
= 0;
71 tp
= cpi
->tok
+ (mb_row
* (cm
->mb_cols
* 16 * 24));
73 last_row_current_mb_col
= &cpi
->mt_current_mb_col
[mb_row
- 1];
75 // reset above block coeffs
76 xd
->above_context
= cm
->above_context
;
77 xd
->left_context
= &mb_row_left_context
;
79 vp8_zero(mb_row_left_context
);
81 xd
->up_available
= (mb_row
!= 0);
82 recon_yoffset
= (mb_row
* recon_y_stride
* 16);
83 recon_uvoffset
= (mb_row
* recon_uv_stride
* 8);
85 cpi
->tplist
[mb_row
].start
= tp
;
87 //printf("Thread mb_row = %d\n", mb_row);
89 // for each macroblock col in image
90 for (mb_col
= 0; mb_col
< cm
->mb_cols
; mb_col
++)
92 int seg_map_index
= (mb_row
* cm
->mb_cols
);
94 if ((mb_col
& (nsync
- 1)) == 0)
96 while (mb_col
> (*last_row_current_mb_col
- nsync
) && *last_row_current_mb_col
!= cm
->mb_cols
- 1)
103 // Distance of Mb to the various image edges.
104 // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
105 xd
->mb_to_left_edge
= -((mb_col
* 16) << 3);
106 xd
->mb_to_right_edge
= ((cm
->mb_cols
- 1 - mb_col
) * 16) << 3;
107 xd
->mb_to_top_edge
= -((mb_row
* 16) << 3);
108 xd
->mb_to_bottom_edge
= ((cm
->mb_rows
- 1 - mb_row
) * 16) << 3;
110 // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
111 x
->mv_col_min
= -((mb_col
* 16) + (VP8BORDERINPIXELS
- 16));
112 x
->mv_col_max
= ((cm
->mb_cols
- 1 - mb_col
) * 16) + (VP8BORDERINPIXELS
- 16);
113 x
->mv_row_min
= -((mb_row
* 16) + (VP8BORDERINPIXELS
- 16));
114 x
->mv_row_max
= ((cm
->mb_rows
- 1 - mb_row
) * 16) + (VP8BORDERINPIXELS
- 16);
116 xd
->dst
.y_buffer
= cm
->yv12_fb
[dst_fb_idx
].y_buffer
+ recon_yoffset
;
117 xd
->dst
.u_buffer
= cm
->yv12_fb
[dst_fb_idx
].u_buffer
+ recon_uvoffset
;
118 xd
->dst
.v_buffer
= cm
->yv12_fb
[dst_fb_idx
].v_buffer
+ recon_uvoffset
;
119 xd
->left_available
= (mb_col
!= 0);
121 x
->rddiv
= cpi
->RDDIV
;
122 x
->rdmult
= cpi
->RDMULT
;
124 if (cpi
->oxcf
.tuning
== VP8_TUNE_SSIM
)
125 activity_sum
+= vp8_activity_masking(cpi
, x
);
127 // Is segmentation enabled
128 // MB level adjutment to quantizer
129 if (xd
->segmentation_enabled
)
131 // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
132 if (cpi
->segmentation_map
[seg_map_index
+ mb_col
] <= 3)
133 xd
->mode_info_context
->mbmi
.segment_id
= cpi
->segmentation_map
[seg_map_index
+ mb_col
];
135 xd
->mode_info_context
->mbmi
.segment_id
= 0;
137 vp8cx_mb_init_quantizer(cpi
, x
);
140 xd
->mode_info_context
->mbmi
.segment_id
= 0; // Set to Segment 0 by default
142 x
->active_ptr
= cpi
->active_map
+ seg_map_index
+ mb_col
;
144 if (cm
->frame_type
== KEY_FRAME
)
146 *totalrate
+= vp8cx_encode_intra_macro_block(cpi
, x
, &tp
);
148 y_modes
[xd
->mbmi
.mode
] ++;
153 *totalrate
+= vp8cx_encode_inter_macroblock(cpi
, x
, &tp
, recon_yoffset
, recon_uvoffset
);
156 inter_y_modes
[xd
->mbmi
.mode
] ++;
158 if (xd
->mbmi
.mode
== SPLITMV
)
162 for (b
= 0; b
< xd
->mbmi
.partition_count
; b
++)
164 inter_b_modes
[x
->partition
->bmi
[b
].mode
] ++;
170 // Count of last ref frame 0,0 useage
171 if ((xd
->mode_info_context
->mbmi
.mode
== ZEROMV
) && (xd
->mode_info_context
->mbmi
.ref_frame
== LAST_FRAME
))
172 cpi
->inter_zz_count
++;
174 // Special case code for cyclic refresh
175 // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
176 // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map
177 if (cpi
->cyclic_refresh_mode_enabled
&& xd
->segmentation_enabled
)
179 const MB_MODE_INFO
* mbmi
= &xd
->mode_info_context
->mbmi
;
180 cpi
->segmentation_map
[seg_map_index
+ mb_col
] = mbmi
->segment_id
;
182 // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
183 // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
184 // else mark it as dirty (1).
185 if (mbmi
->segment_id
)
186 cpi
->cyclic_refresh_map
[seg_map_index
+ mb_col
] = -1;
187 else if ((mbmi
->mode
== ZEROMV
) && (mbmi
->ref_frame
== LAST_FRAME
))
189 if (cpi
->cyclic_refresh_map
[seg_map_index
+ mb_col
] == 1)
190 cpi
->cyclic_refresh_map
[seg_map_index
+ mb_col
] = 0;
193 cpi
->cyclic_refresh_map
[seg_map_index
+ mb_col
] = 1;
197 cpi
->tplist
[mb_row
].stop
= tp
;
199 x
->gf_active_ptr
++; // Increment pointer into gf useage flags structure for next mb
201 for (i
= 0; i
< 16; i
++)
202 vpx_memcpy(&xd
->mode_info_context
->bmi
[i
], &xd
->block
[i
].bmi
, sizeof(xd
->block
[i
].bmi
));
204 // adjust to the next column of macroblocks
205 x
->src
.y_buffer
+= 16;
206 x
->src
.u_buffer
+= 8;
207 x
->src
.v_buffer
+= 8;
212 // Keep track of segment useage
213 segment_counts
[xd
->mode_info_context
->mbmi
.segment_id
]++;
216 xd
->mode_info_context
++;
220 cpi
->mt_current_mb_col
[mb_row
] = mb_col
;
223 //extend the recon for intra prediction
225 &cm
->yv12_fb
[dst_fb_idx
],
226 xd
->dst
.y_buffer
+ 16,
227 xd
->dst
.u_buffer
+ 8,
228 xd
->dst
.v_buffer
+ 8);
230 // this is to account for the border
231 xd
->mode_info_context
++;
233 x
->activity_sum
+= activity_sum
;
235 x
->src
.y_buffer
+= 16 * x
->src
.y_stride
* (cpi
->encoding_thread_count
+ 1) - 16 * cm
->mb_cols
;
236 x
->src
.u_buffer
+= 8 * x
->src
.uv_stride
* (cpi
->encoding_thread_count
+ 1) - 8 * cm
->mb_cols
;
237 x
->src
.v_buffer
+= 8 * x
->src
.uv_stride
* (cpi
->encoding_thread_count
+ 1) - 8 * cm
->mb_cols
;
239 xd
->mode_info_context
+= xd
->mode_info_stride
* cpi
->encoding_thread_count
;
240 x
->partition_info
+= xd
->mode_info_stride
* cpi
->encoding_thread_count
;
242 if (mb_row
== cm
->mb_rows
- 1)
244 //SetEvent(cpi->h_event_main);
245 sem_post(&cpi
->h_event_end_encoding
); /* signal frame encoding end */
251 //printf("exit thread %d\n", ithread);
255 static void setup_mbby_copy(MACROBLOCK
*mbdst
, MACROBLOCK
*mbsrc
)
258 MACROBLOCK
*x
= mbsrc
;
259 MACROBLOCK
*z
= mbdst
;
263 z
->ss_count
= x
->ss_count
;
264 z
->searches_per_step
= x
->searches_per_step
;
265 z
->errorperbit
= x
->errorperbit
;
267 z
->sadperbit16
= x
->sadperbit16
;
268 z
->sadperbit4
= x
->sadperbit4
;
269 z
->errthresh
= x
->errthresh
;
272 z->mv_col_min = x->mv_col_min;
273 z->mv_col_max = x->mv_col_max;
274 z->mv_row_min = x->mv_row_min;
275 z->mv_row_max = x->mv_row_max;
276 z->vector_range = x->vector_range ;
279 z
->vp8_short_fdct4x4
= x
->vp8_short_fdct4x4
;
280 z
->vp8_short_fdct8x4
= x
->vp8_short_fdct8x4
;
281 z
->short_walsh4x4
= x
->short_walsh4x4
;
282 z
->quantize_b
= x
->quantize_b
;
283 z
->optimize
= x
->optimize
;
287 z->src.y_buffer = x->src.y_buffer;
288 z->src.u_buffer = x->src.u_buffer;
289 z->src.v_buffer = x->src.v_buffer;
293 vpx_memcpy(z
->mvcosts
, x
->mvcosts
, sizeof(x
->mvcosts
));
294 z
->mvcost
[0] = &z
->mvcosts
[0][mv_max
+1];
295 z
->mvcost
[1] = &z
->mvcosts
[1][mv_max
+1];
296 z
->mvsadcost
[0] = &z
->mvsadcosts
[0][mv_max
+1];
297 z
->mvsadcost
[1] = &z
->mvsadcosts
[1][mv_max
+1];
300 vpx_memcpy(z
->token_costs
, x
->token_costs
, sizeof(x
->token_costs
));
301 vpx_memcpy(z
->inter_bmode_costs
, x
->inter_bmode_costs
, sizeof(x
->inter_bmode_costs
));
302 //memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
303 //memcpy(z->mvcost, x->mvcost, sizeof(x->mvcost));
304 vpx_memcpy(z
->mbmode_cost
, x
->mbmode_cost
, sizeof(x
->mbmode_cost
));
305 vpx_memcpy(z
->intra_uv_mode_cost
, x
->intra_uv_mode_cost
, sizeof(x
->intra_uv_mode_cost
));
306 vpx_memcpy(z
->bmode_costs
, x
->bmode_costs
, sizeof(x
->bmode_costs
));
308 for (i
= 0; i
< 25; i
++)
310 z
->block
[i
].quant
= x
->block
[i
].quant
;
311 z
->block
[i
].quant_fast
= x
->block
[i
].quant_fast
;
312 z
->block
[i
].quant_shift
= x
->block
[i
].quant_shift
;
313 z
->block
[i
].zbin
= x
->block
[i
].zbin
;
314 z
->block
[i
].zrun_zbin_boost
= x
->block
[i
].zrun_zbin_boost
;
315 z
->block
[i
].round
= x
->block
[i
].round
;
317 z->block[i].src = x->block[i].src;
319 z
->block
[i
].src_stride
= x
->block
[i
].src_stride
;
320 z
->block
[i
].force_empty
= x
->block
[i
].force_empty
;
325 MACROBLOCKD
*xd
= &x
->e_mbd
;
326 MACROBLOCKD
*zd
= &z
->e_mbd
;
329 zd->mode_info_context = xd->mode_info_context;
330 zd->mode_info = xd->mode_info;
332 zd->mode_info_stride = xd->mode_info_stride;
333 zd->frame_type = xd->frame_type;
334 zd->up_available = xd->up_available ;
335 zd->left_available = xd->left_available;
336 zd->left_context = xd->left_context;
337 zd->last_frame_dc = xd->last_frame_dc;
338 zd->last_frame_dccons = xd->last_frame_dccons;
339 zd->gold_frame_dc = xd->gold_frame_dc;
340 zd->gold_frame_dccons = xd->gold_frame_dccons;
341 zd->mb_to_left_edge = xd->mb_to_left_edge;
342 zd->mb_to_right_edge = xd->mb_to_right_edge;
343 zd->mb_to_top_edge = xd->mb_to_top_edge ;
344 zd->mb_to_bottom_edge = xd->mb_to_bottom_edge;
345 zd->gf_active_ptr = xd->gf_active_ptr;
346 zd->frames_since_golden = xd->frames_since_golden;
347 zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame;
349 zd
->subpixel_predict
= xd
->subpixel_predict
;
350 zd
->subpixel_predict8x4
= xd
->subpixel_predict8x4
;
351 zd
->subpixel_predict8x8
= xd
->subpixel_predict8x8
;
352 zd
->subpixel_predict16x16
= xd
->subpixel_predict16x16
;
353 zd
->segmentation_enabled
= xd
->segmentation_enabled
;
354 zd
->mb_segement_abs_delta
= xd
->mb_segement_abs_delta
;
355 vpx_memcpy(zd
->segment_feature_data
, xd
->segment_feature_data
, sizeof(xd
->segment_feature_data
));
357 for (i
= 0; i
< 25; i
++)
359 zd
->block
[i
].dequant
= xd
->block
[i
].dequant
;
364 void vp8cx_init_mbrthread_data(VP8_COMP
*cpi
,
372 VP8_COMMON
*const cm
= & cpi
->common
;
373 MACROBLOCKD
*const xd
= & x
->e_mbd
;
377 for (i
= 0; i
< count
; i
++)
379 MACROBLOCK
*mb
= & mbr_ei
[i
].mb
;
380 MACROBLOCKD
*mbd
= &mb
->e_mbd
;
382 mbd
->subpixel_predict
= xd
->subpixel_predict
;
383 mbd
->subpixel_predict8x4
= xd
->subpixel_predict8x4
;
384 mbd
->subpixel_predict8x8
= xd
->subpixel_predict8x8
;
385 mbd
->subpixel_predict16x16
= xd
->subpixel_predict16x16
;
386 #if CONFIG_RUNTIME_CPU_DETECT
387 mbd
->rtcd
= xd
->rtcd
;
389 mb
->gf_active_ptr
= x
->gf_active_ptr
;
391 mb
->vector_range
= 32;
393 vpx_memset(mbr_ei
[i
].segment_counts
, 0, sizeof(mbr_ei
[i
].segment_counts
));
394 mbr_ei
[i
].totalrate
= 0;
396 mb
->partition_info
= x
->pi
+ x
->e_mbd
.mode_info_stride
* (i
+ 1);
398 mbd
->mode_info_context
= cm
->mi
+ x
->e_mbd
.mode_info_stride
* (i
+ 1);
399 mbd
->mode_info_stride
= cm
->mode_info_stride
;
401 mbd
->frame_type
= cm
->frame_type
;
403 mbd
->frames_since_golden
= cm
->frames_since_golden
;
404 mbd
->frames_till_alt_ref_frame
= cm
->frames_till_alt_ref_frame
;
406 mb
->src
= * cpi
->Source
;
407 mbd
->pre
= cm
->yv12_fb
[cm
->lst_fb_idx
];
408 mbd
->dst
= cm
->yv12_fb
[cm
->new_fb_idx
];
410 mb
->src
.y_buffer
+= 16 * x
->src
.y_stride
* (i
+ 1);
411 mb
->src
.u_buffer
+= 8 * x
->src
.uv_stride
* (i
+ 1);
412 mb
->src
.v_buffer
+= 8 * x
->src
.uv_stride
* (i
+ 1);
414 vp8_build_block_offsets(mb
);
416 vp8_setup_block_dptrs(mbd
);
418 vp8_setup_block_ptrs(mb
);
420 mb
->activity_sum
= 0;
422 mbd
->left_context
= &cm
->left_context
;
423 mb
->mvc
= cm
->fc
.mvc
;
425 setup_mbby_copy(&mbr_ei
[i
].mb
, x
);
430 void vp8cx_create_encoder_threads(VP8_COMP
*cpi
)
432 cpi
->b_multi_threaded
= 0;
434 cpi
->processor_core_count
= 32; //vp8_get_proc_core_count();
436 if (cpi
->processor_core_count
> 1 && cpi
->oxcf
.multi_threaded
> 1)
440 if (cpi
->oxcf
.multi_threaded
> cpi
->processor_core_count
)
441 cpi
->encoding_thread_count
= cpi
->processor_core_count
- 1;
443 cpi
->encoding_thread_count
= cpi
->oxcf
.multi_threaded
- 1;
445 CHECK_MEM_ERROR(cpi
->h_encoding_thread
, vpx_malloc(sizeof(pthread_t
) * cpi
->encoding_thread_count
));
446 CHECK_MEM_ERROR(cpi
->h_event_start_encoding
, vpx_malloc(sizeof(sem_t
) * cpi
->encoding_thread_count
));
447 CHECK_MEM_ERROR(cpi
->mb_row_ei
, vpx_memalign(32, sizeof(MB_ROW_COMP
) * cpi
->encoding_thread_count
));
448 vpx_memset(cpi
->mb_row_ei
, 0, sizeof(MB_ROW_COMP
) * cpi
->encoding_thread_count
);
449 CHECK_MEM_ERROR(cpi
->en_thread_data
, vpx_malloc(sizeof(ENCODETHREAD_DATA
) * cpi
->encoding_thread_count
));
450 CHECK_MEM_ERROR(cpi
->mt_current_mb_col
, vpx_malloc(sizeof(*cpi
->mt_current_mb_col
) * cpi
->common
.mb_rows
));
452 //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
453 sem_init(&cpi
->h_event_end_encoding
, 0, 0);
455 cpi
->b_multi_threaded
= 1;
457 //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
459 for (ithread
= 0; ithread
< cpi
->encoding_thread_count
; ithread
++)
461 ENCODETHREAD_DATA
* ethd
= &cpi
->en_thread_data
[ithread
];
463 //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
464 sem_init(&cpi
->h_event_start_encoding
[ithread
], 0, 0);
465 ethd
->ithread
= ithread
;
466 ethd
->ptr1
= (void *)cpi
;
467 ethd
->ptr2
= (void *)&cpi
->mb_row_ei
[ithread
];
469 //printf(" call begin thread %d \n", ithread);
471 //cpi->h_encoding_thread[ithread] = (HANDLE)_beginthreadex(
474 // thread_encoding_proc,
475 // (&cpi->en_thread_data[ithread]), // Thread data
479 pthread_create(&cpi
->h_encoding_thread
[ithread
], 0, thread_encoding_proc
, ethd
);
486 void vp8cx_remove_encoder_threads(VP8_COMP
*cpi
)
488 if (cpi
->b_multi_threaded
)
490 //shutdown other threads
491 cpi
->b_multi_threaded
= 0;
495 for (i
= 0; i
< cpi
->encoding_thread_count
; i
++)
497 //SetEvent(cpi->h_event_mbrencoding[i]);
498 sem_post(&cpi
->h_event_start_encoding
[i
]);
499 pthread_join(cpi
->h_encoding_thread
[i
], 0);
501 sem_destroy(&cpi
->h_event_start_encoding
[i
]);
505 sem_destroy(&cpi
->h_event_end_encoding
);
507 //free thread related resources
508 vpx_free(cpi
->h_event_start_encoding
);
509 vpx_free(cpi
->h_encoding_thread
);
510 vpx_free(cpi
->mb_row_ei
);
511 vpx_free(cpi
->en_thread_data
);
512 vpx_free(cpi
->mt_current_mb_col
);