/*****************************************************************************
 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
 *****************************************************************************
 * Copyright (C) 2012-2017 x264 project
 *
 * Authors: Steve Borho <sborho@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"

#if HAVE_OPENCL
#ifdef _WIN32
#include <windows.h>
#endif

void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );

/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers. We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
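
/* Error-checking wrapper for OpenCL API calls. It expects an
 * x264_opencl_function_t *ocl and a cl_int status to be in scope, and the
 * enclosing function to return int. On any failure it logs the error,
 * latches a fatal-error flag, and disables OpenCL for the rest of the
 * encode so the lookahead falls back to the C path. */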
#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )

void x264_opencl_flush( x264_t *h )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;

    ocl->clFinish( h->opencl.queue );

    /* Finish copies from the GPU by copying from the page-locked buffer to
     * their final destination */
    for( int i = 0; i < h->opencl.num_copies; i++ )
        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
    h->opencl.num_copies = 0;
    h->opencl.pl_occupancy = 0;
}
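
/* Bump-allocate `bytes` from the page-locked staging buffer that backs all
 * asynchronous device reads and writes. When the buffer would overflow,
 * flush first: that drains the queue and retires every pending copy, making
 * the whole buffer reusable. */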
static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
{
    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
        x264_opencl_flush( h );
    assert( bytes < PAGE_LOCKED_BUF_SIZE );
    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
    h->opencl.pl_occupancy += bytes;
    return ptr;
}
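
/* Upload fenc's luma plane and enqueue the downscale, lowres-intra, and
 * row-sum kernels for it. Runs once per frame; b_intra_calculated makes
 * later calls a no-op. */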
int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
{
    if( fenc->b_intra_calculated )
        return 0;
    fenc->b_intra_calculated = 1;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];

#define CREATEBUF( out, flags, size )\
    out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
#define CREATEIMAGE( out, flags, pf, width, height )\
    out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }

    int mb_count = h->mb.i_mb_count;
    cl_int status;

    if( !h->opencl.lowres_mv_costs )
    {
        /* Allocate shared memory buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }

        CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
        CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
        CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
    }

    if( !fenc->opencl.intra_cost )
    {
        /* Allocate per-frame buffers */
        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
        int height = h->mb.i_mb_height * 8 * sizeof(pixel);

        cl_image_format pixel_format;
        pixel_format.image_channel_order = CL_R;
        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );

        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            pixel_format.image_channel_order = CL_RGBA;
            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
            width >>= 1;
            height >>= 1;
        }
        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
        CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
        CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
    }
#undef CREATEBUF
#undef CREATEIMAGE

    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */

    char *locked = x264_opencl_alloc_locked( h, luma_length );
    memcpy( locked, fenc->plane[0], luma_length );
    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );

    size_t gdim[2];
    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
    {
        int size = h->mb.i_mb_count * sizeof(int16_t);
        locked = x264_opencl_alloc_locked( h, size );
        memcpy( locked, fenc->i_inv_qscale_factor, size );
        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    }
    else
    {
        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
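        /* (the factor is 8.8 fixed point, so 256 == 1.0: every MB cost is
         * passed through unweighted when AQ is disabled) */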
        cl_uint arg = 0;
        int16_t value = 256;
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
        gdim[0] = h->mb.i_mb_count;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
    }

    int stride = fenc->i_stride[0];
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
    gdim[0] = 8 * h->mb.i_mb_width;
    gdim[1] = 8 * h->mb.i_mb_height;
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
    {
        /* Workaround for AMD Southern Island:
         *
         * Alternate kernel instances. No perf impact to this, so we do it for
         * all GPUs. It prevents the same kernel from being enqueued
         * back-to-back, avoiding a dependency calculation bug in the driver.
         */
        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;

        arg = 0;
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
        gdim[0] >>= 1;
        gdim[1] >>= 1;
        if( gdim[0] < 16 || gdim[1] < 16 )
            break;
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
    }

    size_t ldim[2];
    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
    gdim[1] = 8*h->mb.i_mb_height;
    ldim[0] = 32;
    ldim[1] = 8;
    arg = 0;

    /* For presets slow, slower, and placebo, check all 10 intra modes that the
     * C lookahead supports. For faster presets, only check the most frequent 8
     * modes. */
    int slow = h->param.analyse.i_subpel_refine > 7;
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    gdim[0] = 256;
    gdim[1] = h->mb.i_mb_height;
    ldim[0] = 256;
    ldim[1] = 1;
    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    h->opencl.num_copies++;

    size = sizeof(int) * 4;
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
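
    /* Toggle between the two shared stats/cost buffers so the kernels for the
     * next frame can be enqueued while this frame's async reads are pending. */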
    h->opencl.last_buf = !h->opencl.last_buf;
    return 0;
}

/* This function was tested empirically on a number of AMD and NV GPUs. Making a
 * function which returns perfect launch dimensions is impossible; some
 * applications will have self-tuning code to try many possible variables and
 * measure the runtime. Here we simply make an educated guess based on what we
 * know GPUs typically prefer. */
static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
    size_t preferred_multiple = 64;
    cl_uint num_cus = 6;

    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
    ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
    ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );

    ldims[0] = preferred_multiple;
    ldims[1] = 8;
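
    /* ldims[] stay powers of two throughout (assuming preferred_multiple is a
     * power of two, as it is on every GPU we know of), so the (x & (y - 1))
     * tests below are cheap modulo checks. */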
    /* make ldims[1] an even divisor of gdims[1] */
    while( gdims[1] & (ldims[1] - 1) )
    {
        ldims[0] <<= 1;
        ldims[1] >>= 1;
    }
    /* make total ldims fit under the max work-group dimensions for the device */
    while( ldims[0] * ldims[1] > max_work_group )
    {
        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
            ldims[1] >>= 1;
        else
            ldims[0] >>= 1;
    }

    if( ldims[0] > gdims[0] )
    {
        /* remove preferred multiples until we're close to gdims[0] */
        while( gdims[0] + preferred_multiple < ldims[0] )
            ldims[0] -= preferred_multiple;
        gdims[0] = ldims[0];
    }
    else
    {
        /* make gdims an even multiple of ldims */
        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
        gdims[0] *= ldims[0];
    }

    /* make ldims smaller to spread work across compute units */
    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
    {
        if( ldims[0] > preferred_multiple )
            ldims[0] >>= 1;
        else if( ldims[1] > 1 )
            ldims[1] >>= 1;
        else
            break;
    }

    /* for smaller GPUs, try not to abuse their texture cache */
    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
        ldims[0] = 32;
}
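
/* Worked example (device values assumed for illustration): a 1080p lowres
 * frame is 120x68 MBs, so the HME kernel below starts with gdims = { 480, 68 }.
 * With preferred_multiple = 64 and max_work_group = 256, ldims begins
 * { 64, 8 }; 68 is not divisible by 8, so it becomes { 128, 4 }; 128*4 > 256
 * shrinks it to { 64, 4 }; gdims[0] then rounds up to 512, a multiple of 64. */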

int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref = frames[ref];

    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
    cl_mem ref_luma_hpel;
    cl_int status;

    if( w && w->weightfn )
    {
        size_t gdims[2];

        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
        {
            cl_uint arg = 0;
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

            gdims[0] >>= 1;
            gdims[1] >>= 1;
            if( gdims[0] < 16 || gdims[1] < 16 )
                break;
        }

        cl_uint arg = 0;
        gdims[0] = 8 * h->mb.i_mb_width;
        gdims[1] = 8 * h->mb.i_mb_height;

        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );

        /* Use weighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
        ref_luma_hpel = h->opencl.weighted_luma_hpel;
    }
    else
    {
        /* Use unweighted reference planes for motion search */
        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
        ref_luma_hpel = fref->opencl.luma_hpel;
    }

    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
    int b_first_iteration = 1;
    int b_reverse_references = 1;
    int A = 1;
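    /* Hierarchical search: walk from the coarsest scale down to scale 0,
     * running more refinement iterations at the coarser scales
     * (num_iterations is indexed by scale). The two mv_buffers ping-pong
     * (A flips after each enqueue) so every pass consumes the vectors
     * produced by the previous one. */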

    int mb_per_group = 0;
    int cost_local_size = 0;
    int mvc_local_size = 0;
    int mb_width;

    size_t gdims[2];
    size_t ldims[2];

    /* scale 0 is 8x8 */
    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
    {
        mb_width = h->mb.i_mb_width >> scale;
        gdims[0] = mb_width;
        gdims[1] = h->mb.i_mb_height >> scale;
        if( gdims[0] < 2 || gdims[1] < 2 )
            continue;
        gdims[0] <<= 2;
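        /* four work-items cooperate on each macroblock, hence the 4x global
         * width here and the >>2 when deriving mb_per_group below */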
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );

        mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
        int scaled_me_range = h->param.analyse.i_me_range >> scale;
        int b_shift_index = 1;

        cl_uint arg = 0;
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );

        for( int iter = 0; iter < num_iterations[scale]; iter++ )
        {
            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

            b_shift_index = 0;
            b_first_iteration = 0;

            /* alternate top-left vs bot-right MB references at lower scales, so
             * motion field smooths more quickly. */
            if( scale > 2 )
                b_reverse_references ^= 1;
            else
                b_reverse_references = 0;
            A = !A;
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
        }
    }

    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );

    if( b_islist1 )
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    }
    else
    {
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    }

    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );

    if( h->opencl.b_device_AMD_SI )
    {
        /* workaround for AMD Southern Island driver scheduling bug (fixed in
         * July 2012), perform meaningless small copy to add a data dependency */
        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
    }

    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
        x264_opencl_flush( h );

    char *locked = x264_opencl_alloc_locked( h, mvlen );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;

    if( b_islist1 )
    {
        int mvs_offset = mvlen * (ref - b - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
    }
    else
    {
        int mvs_offset = mvlen * (b - ref - 1);
        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
    }

    h->opencl.num_copies++;

    return 0;
}

int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
{
    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_int status;
    x264_frame_t *fenc = frames[b];
    x264_frame_t *fref0 = frames[p0];
    x264_frame_t *fref1 = frames[p1];
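
    /* dist_scale_factor is (b - p0) / (p1 - p0) in 1/256 units; >>2 rescales
     * it to the 0..64 bipred weight range, so a B frame midway between its
     * references (128) gets the default equal weighting of 32. */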
    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;

    /* Tasks for this kernel:
     * 1. Select least cost mode (intra, ref0, ref1)
     *    list_used 0, 1, 2, or 3. if B frame, do not allow intra
     * 2. if B frame, try bidir predictions.
     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
    size_t ldim_bidir[2];
    size_t *ldims = NULL;
    int cost_local_size = 4;
    int satd_local_size = 4;
    if( b < p1 )
    {
        /* For B frames, use 4 threads per MB for BIDIR checks */
        ldims = ldim_bidir;
        gdims[0] <<= 2;
        x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
        int mb_per_group = (ldims[0] >> 2) * ldims[1];
        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
    }

    cl_uint arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );

    /* Sum costs across rows, atomicAdd down frame */
    size_t gdim[2] = { 256, h->mb.i_mb_height };
    size_t ldim[2] = { 256, 1 };

    arg = 0;
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );

    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
        x264_opencl_flush( h );

    int size = h->mb.i_mb_count * sizeof(int16_t);
    char *locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = h->mb.i_mb_height * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = size;
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.num_copies++;

    size = 4 * sizeof(int);
    locked = x264_opencl_alloc_locked( h, size );
    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
    h->opencl.last_buf = !h->opencl.last_buf;

    h->opencl.copies[h->opencl.num_copies].src = locked;
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;
    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
    h->opencl.num_copies++;

    if( b == p1 ) // P frames only
    {
        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
        h->opencl.num_copies++;
    }

    return 0;
}

void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
{
    if( h->param.b_opencl )
    {
#ifdef _WIN32
        /* Temporarily boost priority of this lookahead thread and the OpenCL
         * driver's thread until the end of this function. On AMD GPUs this
         * greatly reduces the latency of enqueuing kernels and getting results
         * on Windows. */
        HANDLE id = GetCurrentThread();
        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
        {
            h->opencl.opencl_thread_pri = GetThreadPriority( id );
            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
        }
#endif

        /* precalculate intra and I frames */
        for( int i = 0; i <= num_frames; i++ )
            x264_opencl_lowres_init( h, frames[i], lambda );
        x264_opencl_flush( h );

        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
        {
            /* For trellis B-Adapt, precompute exhaustive motion searches */
            for( int b = 0; b <= num_frames; b++ )
            {
                for( int j = 1; j < h->param.i_bframe; j++ )
                {
                    int p0 = b - j;
                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
                    {
                        const x264_weight_t *w = x264_weight_none;

                        if( h->param.analyse.i_weighted_pred )
                        {
                            x264_emms();
                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
                            w = frames[b]->weight[0];
                        }
                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
                    }
                    int p1 = b + j;
                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
                    {
                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
                    }
                }
            }

            x264_opencl_flush( h );
        }
    }
}

void x264_opencl_slicetype_end( x264_t *h )
{
#ifdef _WIN32
    if( h->param.b_opencl )
    {
        HANDLE id = GetCurrentThread();
        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
        x264_opencl_function_t *ocl = h->opencl.ocl;
        cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
        if( status == CL_SUCCESS )
            SetThreadPriority( id, h->opencl.opencl_thread_pri );
    }
#endif
}

int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
{
    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
        return 0;
    else
    {
        int do_search[2];
        int dist_scale_factor = 128;
        const x264_weight_t *w = x264_weight_none;

        // avoid duplicating work
        frames[b]->i_cost_est[b-p0][p1-b] = 0;
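
        /* lowres_mvs[list][d][0][0] == 0x7FFF is the "not searched yet"
         * sentinel shared with the C lookahead; writing 0 below claims the
         * search before it is enqueued */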
        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
        if( do_search[0] )
        {
            if( h->param.analyse.i_weighted_pred && b == p1 )
            {
                x264_emms();
                x264_weights_analyse( h, frames[b], frames[p0], 1 );
                w = frames[b]->weight[0];
            }
            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
        }
        if( do_search[1] )
            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
        if( b == p1 )
            frames[b]->i_intra_mbs[b-p0] = 0;
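        /* temporal distance ratio tb/td in 1/256 units, rounded: e.g. for
         * p0=0, b=1, p1=3 this is ((1<<8) + 1) / 3 = 85, roughly 256/3 */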
        if( p1 != p0 )
            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);

        frames[b]->i_cost_est[b-p0][p1-b] = 0;
        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;

        x264_opencl_lowres_init( h, frames[b], lambda );

        if( do_search[0] )
        {
            x264_opencl_lowres_init( h, frames[p0], lambda );
            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
        }
        if( do_search[1] )
        {
            x264_opencl_lowres_init( h, frames[p1], lambda );
            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
        }
        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
        return 1;
    }
}

#endif