1 /*****************************************************************************
2 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
3 *****************************************************************************
4 * Copyright (C) 2012-2017 x264 project
6 * Authors: Steve Borho <sborho@multicorewareinc.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
26 #include "common/common.h"
27 #include "macroblock.h"
35 void x264_weights_analyse( x264_t
*h
, x264_frame_t
*fenc
, x264_frame_t
*ref
, int b_lookahead
);
/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
 * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
/* Invoke an OpenCL API entry point through the dynamically-loaded function
 * table and bail out of the calling function on failure.  Requires `h` (x264_t*),
 * `ocl` (x264_opencl_function_t*) and `status` (cl_int) in the caller's scope,
 * and the caller must return int.  On any error, OpenCL is disabled for the
 * rest of the encode (b_fatal_error latches) and -1 is returned.
 * NOTE(review): the do/while(0) wrapper and the two `return -1;` lines were
 * missing from the mangled source and have been restored — confirm against
 * upstream x264. */
#define OCLCHECK( method, ... )\
do\
{\
    if( h->opencl.b_fatal_error )\
        return -1;\
    status = ocl->method( __VA_ARGS__ );\
    if( status != CL_SUCCESS ) {\
        h->param.b_opencl = 0;\
        h->opencl.b_fatal_error = 1;\
        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
        return -1;\
    }\
} while( 0 )
56 void x264_opencl_flush( x264_t
*h
)
58 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
60 ocl
->clFinish( h
->opencl
.queue
);
62 /* Finish copies from the GPU by copying from the page-locked buffer to
63 * their final destination */
64 for( int i
= 0; i
< h
->opencl
.num_copies
; i
++ )
65 memcpy( h
->opencl
.copies
[i
].dest
, h
->opencl
.copies
[i
].src
, h
->opencl
.copies
[i
].bytes
);
66 h
->opencl
.num_copies
= 0;
67 h
->opencl
.pl_occupancy
= 0;
70 static void *x264_opencl_alloc_locked( x264_t
*h
, int bytes
)
72 if( h
->opencl
.pl_occupancy
+ bytes
>= PAGE_LOCKED_BUF_SIZE
)
73 x264_opencl_flush( h
);
74 assert( bytes
< PAGE_LOCKED_BUF_SIZE
);
75 char *ptr
= h
->opencl
.page_locked_ptr
+ h
->opencl
.pl_occupancy
;
76 h
->opencl
.pl_occupancy
+= bytes
;
80 int x264_opencl_lowres_init( x264_t
*h
, x264_frame_t
*fenc
, int lambda
)
82 if( fenc
->b_intra_calculated
)
84 fenc
->b_intra_calculated
= 1;
86 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
87 int luma_length
= fenc
->i_stride
[0] * fenc
->i_lines
[0];
89 #define CREATEBUF( out, flags, size )\
90 out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
91 if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
92 #define CREATEIMAGE( out, flags, pf, width, height )\
93 out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
94 if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
96 int mb_count
= h
->mb
.i_mb_count
;
99 if( !h
->opencl
.lowres_mv_costs
)
101 /* Allocate shared memory buffers */
102 int width
= h
->mb
.i_mb_width
* 8 * sizeof(pixel
);
103 int height
= h
->mb
.i_mb_height
* 8 * sizeof(pixel
);
105 cl_image_format pixel_format
;
106 pixel_format
.image_channel_order
= CL_R
;
107 pixel_format
.image_channel_data_type
= CL_UNSIGNED_INT32
;
108 CREATEIMAGE( h
->opencl
.weighted_luma_hpel
, CL_MEM_READ_WRITE
, pixel_format
, width
, height
);
110 for( int i
= 0; i
< NUM_IMAGE_SCALES
; i
++ )
112 pixel_format
.image_channel_order
= CL_RGBA
;
113 pixel_format
.image_channel_data_type
= CL_UNSIGNED_INT8
;
114 CREATEIMAGE( h
->opencl
.weighted_scaled_images
[i
], CL_MEM_READ_WRITE
, pixel_format
, width
, height
);
119 CREATEBUF( h
->opencl
.lowres_mv_costs
, CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) );
120 CREATEBUF( h
->opencl
.lowres_costs
[0], CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) );
121 CREATEBUF( h
->opencl
.lowres_costs
[1], CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) );
122 CREATEBUF( h
->opencl
.mv_buffers
[0], CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) * 2 );
123 CREATEBUF( h
->opencl
.mv_buffers
[1], CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) * 2 );
124 CREATEBUF( h
->opencl
.mvp_buffer
, CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) * 2 );
125 CREATEBUF( h
->opencl
.frame_stats
[0], CL_MEM_WRITE_ONLY
, 4 * sizeof(int) );
126 CREATEBUF( h
->opencl
.frame_stats
[1], CL_MEM_WRITE_ONLY
, 4 * sizeof(int) );
127 CREATEBUF( h
->opencl
.row_satds
[0], CL_MEM_WRITE_ONLY
, h
->mb
.i_mb_height
* sizeof(int) );
128 CREATEBUF( h
->opencl
.row_satds
[1], CL_MEM_WRITE_ONLY
, h
->mb
.i_mb_height
* sizeof(int) );
129 CREATEBUF( h
->opencl
.luma_16x16_image
[0], CL_MEM_READ_ONLY
, luma_length
);
130 CREATEBUF( h
->opencl
.luma_16x16_image
[1], CL_MEM_READ_ONLY
, luma_length
);
133 if( !fenc
->opencl
.intra_cost
)
135 /* Allocate per-frame buffers */
136 int width
= h
->mb
.i_mb_width
* 8 * sizeof(pixel
);
137 int height
= h
->mb
.i_mb_height
* 8 * sizeof(pixel
);
139 cl_image_format pixel_format
;
140 pixel_format
.image_channel_order
= CL_R
;
141 pixel_format
.image_channel_data_type
= CL_UNSIGNED_INT32
;
142 CREATEIMAGE( fenc
->opencl
.luma_hpel
, CL_MEM_READ_WRITE
, pixel_format
, width
, height
);
144 for( int i
= 0; i
< NUM_IMAGE_SCALES
; i
++ )
146 pixel_format
.image_channel_order
= CL_RGBA
;
147 pixel_format
.image_channel_data_type
= CL_UNSIGNED_INT8
;
148 CREATEIMAGE( fenc
->opencl
.scaled_image2Ds
[i
], CL_MEM_READ_WRITE
, pixel_format
, width
, height
);
152 CREATEBUF( fenc
->opencl
.inv_qscale_factor
, CL_MEM_READ_ONLY
, mb_count
* sizeof(int16_t) );
153 CREATEBUF( fenc
->opencl
.intra_cost
, CL_MEM_WRITE_ONLY
, mb_count
* sizeof(int16_t) );
154 CREATEBUF( fenc
->opencl
.lowres_mvs0
, CL_MEM_READ_WRITE
, mb_count
* 2 * sizeof(int16_t) * (h
->param
.i_bframe
+ 1) );
155 CREATEBUF( fenc
->opencl
.lowres_mvs1
, CL_MEM_READ_WRITE
, mb_count
* 2 * sizeof(int16_t) * (h
->param
.i_bframe
+ 1) );
156 CREATEBUF( fenc
->opencl
.lowres_mv_costs0
, CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) * (h
->param
.i_bframe
+ 1) );
157 CREATEBUF( fenc
->opencl
.lowres_mv_costs1
, CL_MEM_READ_WRITE
, mb_count
* sizeof(int16_t) * (h
->param
.i_bframe
+ 1) );
162 /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
164 char *locked
= x264_opencl_alloc_locked( h
, luma_length
);
165 memcpy( locked
, fenc
->plane
[0], luma_length
);
166 OCLCHECK( clEnqueueWriteBuffer
, h
->opencl
.queue
, h
->opencl
.luma_16x16_image
[h
->opencl
.last_buf
], CL_FALSE
, 0, luma_length
, locked
, 0, NULL
, NULL
);
169 if( h
->param
.rc
.i_aq_mode
&& fenc
->i_inv_qscale_factor
)
171 int size
= h
->mb
.i_mb_count
* sizeof(int16_t);
172 locked
= x264_opencl_alloc_locked( h
, size
);
173 memcpy( locked
, fenc
->i_inv_qscale_factor
, size
);
174 OCLCHECK( clEnqueueWriteBuffer
, h
->opencl
.queue
, fenc
->opencl
.inv_qscale_factor
, CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
178 /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
181 OCLCHECK( clSetKernelArg
, h
->opencl
.memset_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.inv_qscale_factor
);
182 OCLCHECK( clSetKernelArg
, h
->opencl
.memset_kernel
, arg
++, sizeof(int16_t), &value
);
183 gdim
[0] = h
->mb
.i_mb_count
;
184 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.memset_kernel
, 1, NULL
, gdim
, NULL
, 0, NULL
, NULL
);
187 int stride
= fenc
->i_stride
[0];
189 OCLCHECK( clSetKernelArg
, h
->opencl
.downscale_hpel_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.luma_16x16_image
[h
->opencl
.last_buf
] );
190 OCLCHECK( clSetKernelArg
, h
->opencl
.downscale_hpel_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[0] );
191 OCLCHECK( clSetKernelArg
, h
->opencl
.downscale_hpel_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.luma_hpel
);
192 OCLCHECK( clSetKernelArg
, h
->opencl
.downscale_hpel_kernel
, arg
++, sizeof(int), &stride
);
193 gdim
[0] = 8 * h
->mb
.i_mb_width
;
194 gdim
[1] = 8 * h
->mb
.i_mb_height
;
195 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.downscale_hpel_kernel
, 2, NULL
, gdim
, NULL
, 0, NULL
, NULL
);
197 for( int i
= 0; i
< NUM_IMAGE_SCALES
- 1; i
++ )
199 /* Workaround for AMD Southern Island:
201 * Alternate kernel instances. No perf impact to this, so we do it for
202 * all GPUs. It prevents the same kernel from being enqueued
203 * back-to-back, avoiding a dependency calculation bug in the driver.
205 cl_kernel kern
= i
& 1 ? h
->opencl
.downscale_kernel1
: h
->opencl
.downscale_kernel2
;
208 OCLCHECK( clSetKernelArg
, kern
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[i
] );
209 OCLCHECK( clSetKernelArg
, kern
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[i
+1] );
212 if( gdim
[0] < 16 || gdim
[1] < 16 )
214 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, kern
, 2, NULL
, gdim
, NULL
, 0, NULL
, NULL
);
218 gdim
[0] = ((h
->mb
.i_mb_width
+ 31)>>5)<<5;
219 gdim
[1] = 8*h
->mb
.i_mb_height
;
224 /* For presets slow, slower, and placebo, check all 10 intra modes that the
225 * C lookahead supports. For faster presets, only check the most frequent 8
228 int slow
= h
->param
.analyse
.i_subpel_refine
> 7;
229 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[0] );
230 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.intra_cost
);
231 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.frame_stats
[h
->opencl
.last_buf
] );
232 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(int), &lambda
);
233 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(int), &h
->mb
.i_mb_width
);
234 OCLCHECK( clSetKernelArg
, h
->opencl
.intra_kernel
, arg
++, sizeof(int), &slow
);
235 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.intra_kernel
, 2, NULL
, gdim
, ldim
, 0, NULL
, NULL
);
238 gdim
[1] = h
->mb
.i_mb_height
;
242 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_intra_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.intra_cost
);
243 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_intra_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.inv_qscale_factor
);
244 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_intra_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.row_satds
[h
->opencl
.last_buf
] );
245 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_intra_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.frame_stats
[h
->opencl
.last_buf
] );
246 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_intra_kernel
, arg
++, sizeof(int), &h
->mb
.i_mb_width
);
247 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.rowsum_intra_kernel
, 2, NULL
, gdim
, ldim
, 0, NULL
, NULL
);
249 if( h
->opencl
.num_copies
>= MAX_FINISH_COPIES
- 4 )
250 x264_opencl_flush( h
);
252 int size
= h
->mb
.i_mb_count
* sizeof(int16_t);
253 locked
= x264_opencl_alloc_locked( h
, size
);
254 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, fenc
->opencl
.intra_cost
, CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
255 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->lowres_costs
[0][0];
256 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
257 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= size
;
258 h
->opencl
.num_copies
++;
260 size
= h
->mb
.i_mb_height
* sizeof(int);
261 locked
= x264_opencl_alloc_locked( h
, size
);
262 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, h
->opencl
.row_satds
[h
->opencl
.last_buf
], CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
263 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->i_row_satds
[0][0];
264 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
265 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= size
;
266 h
->opencl
.num_copies
++;
268 size
= sizeof(int) * 4;
269 locked
= x264_opencl_alloc_locked( h
, size
);
270 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, h
->opencl
.frame_stats
[h
->opencl
.last_buf
], CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
271 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= &fenc
->i_cost_est
[0][0];
272 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
273 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= sizeof(int);
274 h
->opencl
.num_copies
++;
275 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= &fenc
->i_cost_est_aq
[0][0];
276 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
+ sizeof(int);
277 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= sizeof(int);
278 h
->opencl
.num_copies
++;
280 h
->opencl
.last_buf
= !h
->opencl
.last_buf
;
284 /* This function was tested emprically on a number of AMD and NV GPUs. Making a
285 * function which returns perfect launch dimensions is impossible; some
286 * applications will have self-tuning code to try many possible variables and
287 * measure the runtime. Here we simply make an educated guess based on what we
288 * know GPUs typically prefer. */
289 static void x264_optimal_launch_dims( x264_t
*h
, size_t *gdims
, size_t *ldims
, const cl_kernel kernel
, const cl_device_id device
)
291 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
292 size_t max_work_group
= 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
293 size_t preferred_multiple
= 64;
296 ocl
->clGetKernelWorkGroupInfo( kernel
, device
, CL_KERNEL_WORK_GROUP_SIZE
, sizeof(size_t), &max_work_group
, NULL
);
297 ocl
->clGetKernelWorkGroupInfo( kernel
, device
, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
, sizeof(size_t), &preferred_multiple
, NULL
);
298 ocl
->clGetDeviceInfo( device
, CL_DEVICE_MAX_COMPUTE_UNITS
, sizeof(cl_uint
), &num_cus
, NULL
);
300 ldims
[0] = preferred_multiple
;
303 /* make ldims[1] an even divisor of gdims[1] */
304 while( gdims
[1] & (ldims
[1] - 1) )
309 /* make total ldims fit under the max work-group dimensions for the device */
310 while( ldims
[0] * ldims
[1] > max_work_group
)
312 if( (ldims
[0] <= preferred_multiple
) && (ldims
[1] > 1) )
318 if( ldims
[0] > gdims
[0] )
320 /* remove preferred multiples until we're close to gdims[0] */
321 while( gdims
[0] + preferred_multiple
< ldims
[0] )
322 ldims
[0] -= preferred_multiple
;
327 /* make gdims an even multiple of ldims */
328 gdims
[0] = (gdims
[0]+ldims
[0]-1)/ldims
[0];
329 gdims
[0] *= ldims
[0];
332 /* make ldims smaller to spread work across compute units */
333 while( (gdims
[0]/ldims
[0]) * (gdims
[1]/ldims
[1]) * 2 <= num_cus
)
335 if( ldims
[0] > preferred_multiple
)
337 else if( ldims
[1] > 1 )
342 /* for smaller GPUs, try not to abuse their texture cache */
343 if( num_cus
== 6 && ldims
[0] == 64 && ldims
[1] == 4 )
347 int x264_opencl_motionsearch( x264_t
*h
, x264_frame_t
**frames
, int b
, int ref
, int b_islist1
, int lambda
, const x264_weight_t
*w
)
349 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
350 x264_frame_t
*fenc
= frames
[b
];
351 x264_frame_t
*fref
= frames
[ref
];
353 cl_mem ref_scaled_images
[NUM_IMAGE_SCALES
];
354 cl_mem ref_luma_hpel
;
357 if( w
&& w
->weightfn
)
361 gdims
[0] = 8 * h
->mb
.i_mb_width
;
362 gdims
[1] = 8 * h
->mb
.i_mb_height
;
364 /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
365 for( int i
= 0; i
< NUM_IMAGE_SCALES
; i
++ )
368 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_scaled_images_kernel
, arg
++, sizeof(cl_mem
), &fref
->opencl
.scaled_image2Ds
[i
] );
369 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_scaled_images_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.weighted_scaled_images
[i
] );
370 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_scaled_images_kernel
, arg
++, sizeof(int32_t), &w
->i_offset
);
371 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_scaled_images_kernel
, arg
++, sizeof(int32_t), &w
->i_scale
);
372 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_scaled_images_kernel
, arg
++, sizeof(int32_t), &w
->i_denom
);
373 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.weightp_scaled_images_kernel
, 2, NULL
, gdims
, NULL
, 0, NULL
, NULL
);
377 if( gdims
[0] < 16 || gdims
[1] < 16 )
382 gdims
[0] = 8 * h
->mb
.i_mb_width
;
383 gdims
[1] = 8 * h
->mb
.i_mb_height
;
385 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_hpel_kernel
, arg
++, sizeof(cl_mem
), &fref
->opencl
.luma_hpel
);
386 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_hpel_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.weighted_luma_hpel
);
387 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_hpel_kernel
, arg
++, sizeof(int32_t), &w
->i_offset
);
388 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_hpel_kernel
, arg
++, sizeof(int32_t), &w
->i_scale
);
389 OCLCHECK( clSetKernelArg
, h
->opencl
.weightp_hpel_kernel
, arg
++, sizeof(int32_t), &w
->i_denom
);
390 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.weightp_hpel_kernel
, 2, NULL
, gdims
, NULL
, 0, NULL
, NULL
);
392 /* Use weighted reference planes for motion search */
393 for( int i
= 0; i
< NUM_IMAGE_SCALES
; i
++ )
394 ref_scaled_images
[i
] = h
->opencl
.weighted_scaled_images
[i
];
395 ref_luma_hpel
= h
->opencl
.weighted_luma_hpel
;
399 /* Use unweighted reference planes for motion search */
400 for( int i
= 0; i
< NUM_IMAGE_SCALES
; i
++ )
401 ref_scaled_images
[i
] = fref
->opencl
.scaled_image2Ds
[i
];
402 ref_luma_hpel
= fref
->opencl
.luma_hpel
;
405 const int num_iterations
[NUM_IMAGE_SCALES
] = { 1, 1, 2, 3 };
406 int b_first_iteration
= 1;
407 int b_reverse_references
= 1;
411 int mb_per_group
= 0;
412 int cost_local_size
= 0;
413 int mvc_local_size
= 0;
420 for( int scale
= NUM_IMAGE_SCALES
-1; scale
>= 0; scale
-- )
422 mb_width
= h
->mb
.i_mb_width
>> scale
;
424 gdims
[1] = h
->mb
.i_mb_height
>> scale
;
425 if( gdims
[0] < 2 || gdims
[1] < 2 )
428 x264_optimal_launch_dims( h
, gdims
, ldims
, h
->opencl
.hme_kernel
, h
->opencl
.device
);
430 mb_per_group
= (ldims
[0] >> 2) * ldims
[1];
431 cost_local_size
= 4 * mb_per_group
* sizeof(int16_t);
432 mvc_local_size
= 4 * mb_per_group
* sizeof(int16_t) * 2;
433 int scaled_me_range
= h
->param
.analyse
.i_me_range
>> scale
;
434 int b_shift_index
= 1;
437 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[scale
] );
438 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), &ref_scaled_images
[scale
] );
439 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.mv_buffers
[A
] );
440 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.mv_buffers
[!A
] );
441 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.lowres_mv_costs
);
442 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(cl_mem
), (void*)&h
->opencl
.mvp_buffer
);
443 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, cost_local_size
, NULL
);
444 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, mvc_local_size
, NULL
);
445 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &mb_width
);
446 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &lambda
);
447 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &scaled_me_range
);
448 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &scale
);
449 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &b_shift_index
);
450 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &b_first_iteration
);
451 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
++, sizeof(int), &b_reverse_references
);
453 for( int iter
= 0; iter
< num_iterations
[scale
]; iter
++ )
455 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.hme_kernel
, 2, NULL
, gdims
, ldims
, 0, NULL
, NULL
);
458 b_first_iteration
= 0;
460 /* alternate top-left vs bot-right MB references at lower scales, so
461 * motion field smooths more quickly. */
463 b_reverse_references
^= 1;
465 b_reverse_references
= 0;
467 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, 2, sizeof(cl_mem
), &h
->opencl
.mv_buffers
[A
] );
468 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, 3, sizeof(cl_mem
), &h
->opencl
.mv_buffers
[!A
] );
469 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
- 3, sizeof(int), &b_shift_index
);
470 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
- 2, sizeof(int), &b_first_iteration
);
471 OCLCHECK( clSetKernelArg
, h
->opencl
.hme_kernel
, arg
- 1, sizeof(int), &b_reverse_references
);
475 int satd_local_size
= mb_per_group
* sizeof(uint32_t) * 16;
477 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[0] );
478 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &ref_luma_hpel
);
479 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.mv_buffers
[A
] );
480 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.lowres_mv_costs
);
481 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, cost_local_size
, NULL
);
482 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, satd_local_size
, NULL
);
483 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, mvc_local_size
, NULL
);
487 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mvs1
);
488 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mv_costs1
);
492 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mvs0
);
493 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mv_costs0
);
496 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(int), &mb_width
);
497 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(int), &lambda
);
498 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(int), &b
);
499 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(int), &ref
);
500 OCLCHECK( clSetKernelArg
, h
->opencl
.subpel_refine_kernel
, arg
++, sizeof(int), &b_islist1
);
502 if( h
->opencl
.b_device_AMD_SI
)
504 /* workaround for AMD Southern Island driver scheduling bug (fixed in
505 * July 2012), perform meaningless small copy to add a data dependency */
506 OCLCHECK( clEnqueueCopyBuffer
, h
->opencl
.queue
, h
->opencl
.mv_buffers
[A
], h
->opencl
.mv_buffers
[!A
], 0, 0, 20, 0, NULL
, NULL
);
509 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.subpel_refine_kernel
, 2, NULL
, gdims
, ldims
, 0, NULL
, NULL
);
511 int mvlen
= 2 * sizeof(int16_t) * h
->mb
.i_mb_count
;
513 if( h
->opencl
.num_copies
>= MAX_FINISH_COPIES
- 1 )
514 x264_opencl_flush( h
);
516 char *locked
= x264_opencl_alloc_locked( h
, mvlen
);
517 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
518 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= mvlen
;
522 int mvs_offset
= mvlen
* (ref
- b
- 1);
523 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, fenc
->opencl
.lowres_mvs1
, CL_FALSE
, mvs_offset
, mvlen
, locked
, 0, NULL
, NULL
);
524 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->lowres_mvs
[1][ref
- b
- 1];
528 int mvs_offset
= mvlen
* (b
- ref
- 1);
529 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, fenc
->opencl
.lowres_mvs0
, CL_FALSE
, mvs_offset
, mvlen
, locked
, 0, NULL
, NULL
);
530 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->lowres_mvs
[0][b
- ref
- 1];
533 h
->opencl
.num_copies
++;
538 int x264_opencl_finalize_cost( x264_t
*h
, int lambda
, x264_frame_t
**frames
, int p0
, int p1
, int b
, int dist_scale_factor
)
540 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
542 x264_frame_t
*fenc
= frames
[b
];
543 x264_frame_t
*fref0
= frames
[p0
];
544 x264_frame_t
*fref1
= frames
[p1
];
546 int bipred_weight
= h
->param
.analyse
.b_weighted_bipred
? 64 - (dist_scale_factor
>> 2) : 32;
548 /* Tasks for this kernel:
549 * 1. Select least cost mode (intra, ref0, ref1)
550 * list_used 0, 1, 2, or 3. if B frame, do not allow intra
551 * 2. if B frame, try bidir predictions.
552 * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
553 size_t gdims
[2] = { h
->mb
.i_mb_width
, h
->mb
.i_mb_height
};
554 size_t ldim_bidir
[2];
555 size_t *ldims
= NULL
;
556 int cost_local_size
= 4;
557 int satd_local_size
= 4;
560 /* For B frames, use 4 threads per MB for BIDIR checks */
563 x264_optimal_launch_dims( h
, gdims
, ldims
, h
->opencl
.mode_select_kernel
, h
->opencl
.device
);
564 int mb_per_group
= (ldims
[0] >> 2) * ldims
[1];
565 cost_local_size
= 4 * mb_per_group
* sizeof(int16_t);
566 satd_local_size
= 16 * mb_per_group
* sizeof(uint32_t);
570 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.scaled_image2Ds
[0] );
571 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fref0
->opencl
.luma_hpel
);
572 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fref1
->opencl
.luma_hpel
);
573 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mvs0
);
574 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mvs1
);
575 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fref1
->opencl
.lowres_mvs0
);
576 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mv_costs0
);
577 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.lowres_mv_costs1
);
578 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.intra_cost
);
579 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.lowres_costs
[h
->opencl
.last_buf
] );
580 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.frame_stats
[h
->opencl
.last_buf
] );
581 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, cost_local_size
, NULL
);
582 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, satd_local_size
, NULL
);
583 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &h
->mb
.i_mb_width
);
584 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &bipred_weight
);
585 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &dist_scale_factor
);
586 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &b
);
587 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &p0
);
588 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &p1
);
589 OCLCHECK( clSetKernelArg
, h
->opencl
.mode_select_kernel
, arg
++, sizeof(int), &lambda
);
590 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.mode_select_kernel
, 2, NULL
, gdims
, ldims
, 0, NULL
, NULL
);
592 /* Sum costs across rows, atomicAdd down frame */
593 size_t gdim
[2] = { 256, h
->mb
.i_mb_height
};
594 size_t ldim
[2] = { 256, 1 };
597 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.lowres_costs
[h
->opencl
.last_buf
] );
598 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(cl_mem
), &fenc
->opencl
.inv_qscale_factor
);
599 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.row_satds
[h
->opencl
.last_buf
] );
600 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(cl_mem
), &h
->opencl
.frame_stats
[h
->opencl
.last_buf
] );
601 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(int), &h
->mb
.i_mb_width
);
602 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(int), &h
->param
.i_bframe_bias
);
603 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(int), &b
);
604 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(int), &p0
);
605 OCLCHECK( clSetKernelArg
, h
->opencl
.rowsum_inter_kernel
, arg
++, sizeof(int), &p1
);
606 OCLCHECK( clEnqueueNDRangeKernel
, h
->opencl
.queue
, h
->opencl
.rowsum_inter_kernel
, 2, NULL
, gdim
, ldim
, 0, NULL
, NULL
);
608 if( h
->opencl
.num_copies
>= MAX_FINISH_COPIES
- 4 )
609 x264_opencl_flush( h
);
611 int size
= h
->mb
.i_mb_count
* sizeof(int16_t);
612 char *locked
= x264_opencl_alloc_locked( h
, size
);
613 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
614 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->lowres_costs
[b
- p0
][p1
- b
];
615 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= size
;
616 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, h
->opencl
.lowres_costs
[h
->opencl
.last_buf
], CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
617 h
->opencl
.num_copies
++;
619 size
= h
->mb
.i_mb_height
* sizeof(int);
620 locked
= x264_opencl_alloc_locked( h
, size
);
621 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
622 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= fenc
->i_row_satds
[b
- p0
][p1
- b
];
623 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= size
;
624 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, h
->opencl
.row_satds
[h
->opencl
.last_buf
], CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
625 h
->opencl
.num_copies
++;
627 size
= 4 * sizeof(int);
628 locked
= x264_opencl_alloc_locked( h
, size
);
629 OCLCHECK( clEnqueueReadBuffer
, h
->opencl
.queue
, h
->opencl
.frame_stats
[h
->opencl
.last_buf
], CL_FALSE
, 0, size
, locked
, 0, NULL
, NULL
);
630 h
->opencl
.last_buf
= !h
->opencl
.last_buf
;
632 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
;
633 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= &fenc
->i_cost_est
[b
- p0
][p1
- b
];
634 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= sizeof(int);
635 h
->opencl
.num_copies
++;
636 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
+ sizeof(int);
637 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= &fenc
->i_cost_est_aq
[b
- p0
][p1
- b
];
638 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= sizeof(int);
639 h
->opencl
.num_copies
++;
641 if( b
== p1
) // P frames only
643 h
->opencl
.copies
[h
->opencl
.num_copies
].src
= locked
+ 2 * sizeof(int);
644 h
->opencl
.copies
[h
->opencl
.num_copies
].dest
= &fenc
->i_intra_mbs
[b
- p0
];
645 h
->opencl
.copies
[h
->opencl
.num_copies
].bytes
= sizeof(int);
646 h
->opencl
.num_copies
++;
651 void x264_opencl_slicetype_prep( x264_t
*h
, x264_frame_t
**frames
, int num_frames
, int lambda
)
653 if( h
->param
.b_opencl
)
656 /* Temporarily boost priority of this lookahead thread and the OpenCL
657 * driver's thread until the end of this function. On AMD GPUs this
658 * greatly reduces the latency of enqueuing kernels and getting results
660 HANDLE id
= GetCurrentThread();
661 h
->opencl
.lookahead_thread_pri
= GetThreadPriority( id
);
662 SetThreadPriority( id
, THREAD_PRIORITY_ABOVE_NORMAL
);
663 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
664 cl_int status
= ocl
->clGetCommandQueueInfo( h
->opencl
.queue
, CL_QUEUE_THREAD_HANDLE_AMD
, sizeof(HANDLE
), &id
, NULL
);
665 if( status
== CL_SUCCESS
)
667 h
->opencl
.opencl_thread_pri
= GetThreadPriority( id
);
668 SetThreadPriority( id
, THREAD_PRIORITY_ABOVE_NORMAL
);
672 /* precalculate intra and I frames */
673 for( int i
= 0; i
<= num_frames
; i
++ )
674 x264_opencl_lowres_init( h
, frames
[i
], lambda
);
675 x264_opencl_flush( h
);
677 if( h
->param
.i_bframe_adaptive
== X264_B_ADAPT_TRELLIS
&& h
->param
.i_bframe
)
679 /* For trellis B-Adapt, precompute exhaustive motion searches */
680 for( int b
= 0; b
<= num_frames
; b
++ )
682 for( int j
= 1; j
< h
->param
.i_bframe
; j
++ )
685 if( p0
>= 0 && frames
[b
]->lowres_mvs
[0][b
-p0
-1][0][0] == 0x7FFF )
687 const x264_weight_t
*w
= x264_weight_none
;
689 if( h
->param
.analyse
.i_weighted_pred
)
692 x264_weights_analyse( h
, frames
[b
], frames
[p0
], 1 );
693 w
= frames
[b
]->weight
[0];
695 frames
[b
]->lowres_mvs
[0][b
-p0
-1][0][0] = 0;
696 x264_opencl_motionsearch( h
, frames
, b
, p0
, 0, lambda
, w
);
699 if( p1
<= num_frames
&& frames
[b
]->lowres_mvs
[1][p1
-b
-1][0][0] == 0x7FFF )
701 frames
[b
]->lowres_mvs
[1][p1
-b
-1][0][0] = 0;
702 x264_opencl_motionsearch( h
, frames
, b
, p1
, 1, lambda
, NULL
);
707 x264_opencl_flush( h
);
713 void x264_opencl_slicetype_end( x264_t
*h
)
716 if( h
->param
.b_opencl
)
718 HANDLE id
= GetCurrentThread();
719 SetThreadPriority( id
, h
->opencl
.lookahead_thread_pri
);
720 x264_opencl_function_t
*ocl
= h
->opencl
.ocl
;
721 cl_int status
= ocl
->clGetCommandQueueInfo( h
->opencl
.queue
, CL_QUEUE_THREAD_HANDLE_AMD
, sizeof(HANDLE
), &id
, NULL
);
722 if( status
== CL_SUCCESS
)
723 SetThreadPriority( id
, h
->opencl
.opencl_thread_pri
);
728 int x264_opencl_precalculate_frame_cost( x264_t
*h
, x264_frame_t
**frames
, int lambda
, int p0
, int p1
, int b
)
730 if( (frames
[b
]->i_cost_est
[b
-p0
][p1
-b
] >= 0) || (b
== p0
&& b
== p1
) )
735 int dist_scale_factor
= 128;
736 const x264_weight_t
*w
= x264_weight_none
;
738 // avoid duplicating work
739 frames
[b
]->i_cost_est
[b
-p0
][p1
-b
] = 0;
741 do_search
[0] = b
!= p0
&& frames
[b
]->lowres_mvs
[0][b
-p0
-1][0][0] == 0x7FFF;
742 do_search
[1] = b
!= p1
&& frames
[b
]->lowres_mvs
[1][p1
-b
-1][0][0] == 0x7FFF;
745 if( h
->param
.analyse
.i_weighted_pred
&& b
== p1
)
748 x264_weights_analyse( h
, frames
[b
], frames
[p0
], 1 );
749 w
= frames
[b
]->weight
[0];
751 frames
[b
]->lowres_mvs
[0][b
-p0
-1][0][0] = 0;
754 frames
[b
]->lowres_mvs
[1][p1
-b
-1][0][0] = 0;
756 frames
[b
]->i_intra_mbs
[b
-p0
] = 0;
758 dist_scale_factor
= ( ((b
-p0
) << 8) + ((p1
-p0
) >> 1) ) / (p1
-p0
);
760 frames
[b
]->i_cost_est
[b
-p0
][p1
-b
] = 0;
761 frames
[b
]->i_cost_est_aq
[b
-p0
][p1
-b
] = 0;
763 x264_opencl_lowres_init( h
, frames
[b
], lambda
);
767 x264_opencl_lowres_init( h
, frames
[p0
], lambda
);
768 x264_opencl_motionsearch( h
, frames
, b
, p0
, 0, lambda
, w
);
772 x264_opencl_lowres_init( h
, frames
[p1
], lambda
);
773 x264_opencl_motionsearch( h
, frames
, b
, p1
, 1, lambda
, NULL
);
775 x264_opencl_finalize_cost( h
, lambda
, frames
, p0
, p1
, b
, dist_scale_factor
);