/*****************************************************************************
 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
 *****************************************************************************
 * Copyright (C) 2012-2017 x264 project
 *
 * Authors: Steve Borho <sborho@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif

void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers. We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
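
/* Error-checking wrapper for OpenCL API calls. It expects an
 * x264_opencl_function_t *ocl and a cl_int status to be in scope, and the
 * enclosing function to return int. On any failure it logs the error,
 * latches a fatal-error flag, and disables OpenCL for the rest of the
 * encode so the lookahead falls back to the C path. */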
#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )

void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    for( int i = 0; i < h->opencl.num_copies; i++ )
        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}
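
/* Bump-allocate `bytes` from the page-locked staging buffer that backs all
 * asynchronous device reads and writes. When the buffer would overflow,
 * flush first: that drains the queue and retires every pending copy, making
 * the whole buffer reusable. */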
static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
{
    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );
    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return ptr;
}
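
/* Upload fenc's luma plane and enqueue the downscale, lowres-intra, and
 * row-sum kernels for it. Runs once per frame; b_intra_calculated makes
 * later calls a no-op. */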
int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

#define CREATEBUF( out, flags, size )\
    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF
#undef CREATEIMAGE

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
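        /* (the factor is 8.8 fixed point, so 256 == 1.0: every MB cost is
         * passed through unweighted when AQ is disabled) */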
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }

    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island:
         *
         * Alternate kernel instances. No perf impact to this, so we do it for
         * all GPUs. It prevents the same kernel from being enqueued
         * back-to-back, avoiding a dependency calculation bug in the driver.
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;

    /* For presets slow, slower, and placebo, check all 10 intra modes that the
     * C lookahead supports. For faster presets, only check the most frequent 8
     * modes. */
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
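
    /* Toggle between the two shared stats/cost buffers so the kernels for the
     * next frame can be enqueued while this frame's async reads are pending. */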
    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}

/* This function was tested empirically on a number of AMD and NV GPUs. Making a
 * function which returns perfect launch dimensions is impossible; some
 * applications will have self-tuning code to try many possible variables and
 * measure the runtime. Here we simply make an educated guess based on what we
 * know GPUs typically prefer. */
static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;
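
    /* ldims[] stay powers of two throughout (assuming preferred_multiple is a
     * power of two, as it is on every GPU we know of), so the (x & (y - 1))
     * tests below are cheap modulo checks. */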
    /* make ldims[1] an even divisor of gdims[1] */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }

    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
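
/* Worked example (device values assumed for illustration): a 1080p lowres
 * frame is 120x68 MBs, so the HME kernel below starts with gdims = { 480, 68 }.
 * With preferred_multiple = 64 and max_work_group = 256, ldims begins
 * { 64, 8 }; 68 is not divisible by 8, so it becomes { 128, 4 }; 128*4 > 256
 * shrinks it to { 64, 4 }; gdims[0] then rounds up to 512, a multiple of 64. */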

int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref = frames[ref];

    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
    cl_mem ref_luma_hpel;
    cl_int status;

    if( w && w->weightfn )
    {
        size_t gdims[2];

        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            cl_uint arg = 0;
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

            gdims[0] >>= 1;
            gdims[1] >>= 1;
            if( gdims[0] < 16 || gdims[1] < 16 )
                break;
        }

        cl_uint arg = 0;
        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

        /* Use weighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
        ref_luma_hpel = h->opencl.weighted_luma_hpel;
    }
    else
    {
        /* Use unweighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
        ref_luma_hpel = fref->opencl.luma_hpel;
    }

    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
    int b_first_iteration = 1;
    int b_reverse_references = 1;
    int A = 1;
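    /* Hierarchical search: walk from the coarsest scale down to scale 0,
     * running more refinement iterations at the coarser scales
     * (num_iterations is indexed by scale). The two mv_buffers ping-pong
     * (A flips after each enqueue) so every pass consumes the vectors
     * produced by the previous one. */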

    int mb_per_group = 0;
    int cost_local_size = 0;
    int mvc_local_size = 0;
    int mb_width;

    size_t gdims[2];
    size_t ldims[2];

    /* scale 0 is 8x8 */
    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
    {
        mb_width = h->mb.i_mb_width >> scale;
        gdims[0] = mb_width;
        gdims[1] = h->mb.i_mb_height >> scale;
        if( gdims[0] < 2 || gdims[1] < 2 )
            continue;
        gdims[0] <<= 2;
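        /* four work-items cooperate on each macroblock, hence the 4x global
         * width here and the >>2 when deriving mb_per_group below */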
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );

        mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
        int scaled_me_range = h->param.analyse.i_me_range >> scale;
        int b_shift_index = 1;

        cl_uint arg = 0;
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );

        for( int iter = 0; iter < num_iterations[scale]; iter++ )
        {
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

            b_shift_index = 0;
            b_first_iteration = 0;

            /* alternate top-left vs bot-right MB references at lower scales, so
             * motion field smooths more quickly. */
            if( scale > 2 )
                b_reverse_references ^= 1;
            else
                b_reverse_references = 0;
            A = !A;
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
        }
    }

    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );

    if( b_islist1 )
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    }
    else
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    }

    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );

    if( h->opencl.b_device_AMD_SI )
    {
        /* workaround for AMD Southern Island driver scheduling bug (fixed in
         * July 2012), perform meaningless small copy to add a data dependency */
        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
    }

    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
        x264_opencl_flush( h );

    char *locked = x264_opencl_alloc_locked( h, mvlen );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;

    if( b_islist1 )
    {
        int mvs_offset = mvlen * (ref - b - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
    }
    else
    {
        int mvs_offset = mvlen * (b - ref - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
    }

    h->opencl.num_copies++;

    return 0;
}

int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];
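
    /* dist_scale_factor is (b - p0) / (p1 - p0) in 1/256 units; >>2 rescales
     * it to the 0..64 bipred weight range, so a B frame midway between its
     * references (128) gets the default equal weighting of 32. */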
    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;

    /* Tasks for this kernel:
     * 1. Select least cost mode (intra, ref0, ref1)
     *    list_used 0, 1, 2, or 3. if B frame, do not allow intra
     * 2. if B frame, try bidir predictions.
     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
    size_t ldim_bidir[2];
    size_t *ldims = NULL;
    int cost_local_size = 4;
    int satd_local_size = 4;
    if( b < p1 )
    {
        /* For B frames, use 4 threads per MB for BIDIR checks */
        ldims = ldim_bidir;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
        int mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
    }

    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    /* Sum costs across rows, atomicAdd down frame */
    size_t gdim[2] = { 256, h->mb.i_mb_height };
    size_t ldim[2] = { 256, 1 };

    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    char *locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = 4 * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.last_buf = !h->opencl.last_buf;

    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    if( b == p1 ) // P frames only
    {
        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
        h->opencl.num_copies++;
    }

    return 0;
}

void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( h->param.b_opencl )
    {
#ifdef _WIN32
        /* Temporarily boost priority of this lookahead thread and the OpenCL
         * driver's thread until the end of this function. On AMD GPUs this
         * greatly reduces the latency of enqueuing kernels and getting results
         * on Windows. */
        HANDLE id = GetCurrentThread();
        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
        {
            h->opencl.opencl_thread_pri = GetThreadPriority( id );
            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        }
#endif

        /* precalculate intra and I frames */
        for( int i = 0; i <= num_frames; i++ )
            x264_opencl_lowres_init( h, frames[i], lambda );
        x264_opencl_flush( h );

        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
        {
            /* For trellis B-Adapt, precompute exhaustive motion searches */
            for( int b = 0; b <= num_frames; b++ )
            {
                for( int j = 1; j < h->param.i_bframe; j++ )
                {
                    int p0 = b - j;
                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
                    {
                        const x264_weight_t *w = x264_weight_none;

                        if( h->param.analyse.i_weighted_pred )
                        {
                            x264_emms();
                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
                            w = frames[b]->weight[0];
                        }
                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
                    }
                    int p1 = b + j;
                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
                    {
                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
                    }
                }
            }

            x264_opencl_flush( h );
        }
    }
}

void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( h->param.b_opencl )
    {
        HANDLE id = GetCurrentThread();
        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
            SetThreadPriority( id, h->opencl.opencl_thread_pri );
    }
#endif
}

int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
        return 0;
    else
    {
        int do_search[2];
        int dist_scale_factor = 128;
        const x264_weight_t *w = x264_weight_none;

        // avoid duplicating work
        frames[b]->i_cost_est[b-p0][p1-b] = 0;
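
        /* lowres_mvs[list][d][0][0] == 0x7FFF is the "not searched yet"
         * sentinel shared with the C lookahead; writing 0 below claims the
         * search before it is enqueued */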
        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] )
            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
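        /* temporal distance ratio tb/td in 1/256 units, rounded: e.g. for
         * p0=0, b=1, p1=3 this is ((1<<8) + 1) / 3 = 85, roughly 256/3 */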
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        x264_opencl_lowres_init( h, frames[b], lambda );

        if( do_search[0] )
        {
            x264_opencl_lowres_init( h, frames[p0], lambda );
            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
        }
        if( do_search[1] )
        {
            x264_opencl_lowres_init( h, frames[p1], lambda );
            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
        }
        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
        return 1;
    }
}

#endif