libavfilter/vf_nlmeans_vulkan.c
/*
 * Copyright (c) Lynne
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mem.h"
#include "libavutil/random_seed.h"
#include "libavutil/vulkan_spirv.h"
#include "libavutil/opt.h"
#include "vulkan_filter.h"

#include "filters.h"
#include "video.h"
#define TYPE_NAME  "vec4"
#define TYPE_ELEMS 4
#define TYPE_SIZE  (TYPE_ELEMS*4)
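/* The shaders below walk the research window in groups of TYPE_ELEMS xy-offsets
 * at a time: each group is packed into one GLSL value of the type named by
 * TYPE_NAME (a vec4, TYPE_SIZE bytes), which is also the element type of the
 * integral image buffer. */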
typedef struct NLMeansVulkanContext {
    FFVulkanContext vkctx;

    int initialized;
    FFVkExecPool e;
    AVVulkanDeviceQueueFamily *qf;
    VkSampler sampler;

    AVBufferPool *integral_buf_pool;
    AVBufferPool *ws_buf_pool;

    FFVkBuffer xyoffsets_buf;

    int pl_weights_rows;
    FFVulkanShader shd_weights;
    FFVulkanShader shd_denoise;

    int *xoffsets;
    int *yoffsets;
    int nb_offsets;
    float strength[4];
    int patch[4];

    struct nlmeans_opts {
        int r;
        double s;
        double sc[4];
        int p;
        int pc[4];
        int t;
    } opts;
} NLMeansVulkanContext;
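/*
 * Shader-source generators.
 *
 * The weights shader handles one group of TYPE_ELEMS offsets per z-layer of a
 * dispatch (gl_WorkGroupID.z selects the group). For each group it builds an
 * integral image (summed-area table) of the per-pixel squared differences
 * between the frame and its offset copies, using a horizontal and a vertical
 * prefix-sum pass; whichever dimension is larger runs first and is fused with
 * the difference computation. The weights pass then reads the four corners of
 * each patch from the integral image to obtain the patch distance, turns it
 * into an NLM weight and accumulates the weight and the weighted source pixel
 * into the weights/sums buffers. The denoise shader finally normalizes those
 * sums into the output image.
 */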
static void insert_first(FFVulkanShader *shd, int r, const char *off, int horiz, int plane, int comp)
{
    GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);

    GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
}
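/* Emits one horizontal pass of the prefix sum over the integral buffer; on the
 * first pass the squared differences are generated in-place via insert_first(),
 * later passes re-read what the previous pass wrote. */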
static void insert_horizontal_pass(FFVulkanShader *shd, int nb_rows, int first, int plane, int comp)
{
    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    if (!first)
        GLSLC(1, barrier(); );
    GLSLC(0, );
    GLSLF(1, if (pos.y < height[%i]) { ,plane);
    GLSLC(2, #pragma unroll(1) );
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLC(3, prefix_sum = DTYPE(0); );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(0, );
    GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    if (first)
        insert_first(shd, 0, "r", 0, plane, comp);
    else
        GLSLC(4, s2 = dst.v[pos.x]; );
    GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
    GLSLC(4, prefix_sum += s2; );
    GLSLC(3, } );
    GLSLC(2, } );
    GLSLC(1, } );
    GLSLC(0, );
}
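/* Same idea as the horizontal pass, but running the prefix sum down each
 * column; every invocation carries nb_rows running sums (psum[]) so it can
 * walk nb_rows adjacent columns at once. */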
static void insert_vertical_pass(FFVulkanShader *shd, int nb_rows, int first, int plane, int comp)
{
    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, #pragma unroll(1) );
    GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
    GLSLC(2, psum[r] = DTYPE(0); );
    GLSLC(0, );
    if (!first)
        GLSLC(1, barrier(); );
    GLSLC(0, );
    GLSLF(1, if (pos.x < width[%i]) { ,plane);
    GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
    GLSLC(3, offset = int_stride * uint64_t(pos.y); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(0, );
    GLSLC(3, #pragma unroll(1) );
    GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
    if (first)
        insert_first(shd, 0, "r", 1, plane, comp);
    else
        GLSLC(4, s2 = dst.v[pos.x + r]; );
    GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
    GLSLC(4, psum[r] += s2; );
    GLSLC(3, } );
    GLSLC(2, } );
    GLSLC(1, } );
    GLSLC(0, );
}
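/* Emits the weights pass: for every pixel the summed-area table gives the
 * patch-wise sum of squared differences from four corner reads
 * (d + a - b - c); that distance is turned into a weight with
 * w = exp(patch_diff * strength) and accumulated, together with the weighted
 * source pixel, into the per-component weights/sums buffers (atomically when
 * several offset groups run in parallel, i.e. t > 1). */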
static void insert_weights_pass(FFVulkanShader *shd, int nb_rows, int vert,
                                int t, int dst_comp, int plane, int comp)
{
    GLSLF(1, p = patch_size[%i]; ,dst_comp);
    GLSLC(0, );
    GLSLC(1, barrier(); );
    GLSLC(0, );
    if (!vert) {
        GLSLF(1, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
        GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
        GLSLC(3, break; );
        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
        GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    } else {
        GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
        GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
        GLSLC(3, break; );
        GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
        GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    }
    GLSLC(0, );
    GLSLC(3, a = DTYPE(0); );
    GLSLC(3, b = DTYPE(0); );
    GLSLC(3, c = DTYPE(0); );
    GLSLC(3, d = DTYPE(0); );
    GLSLC(0, );
    GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
    GLSLC(0, );
    GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
    GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
    GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
    GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
    GLSLC(0, );
    GLSLC(3, if (lt == false) { );
    GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, a = dst.v[pos.x - p]; );
    GLSLC(4, c = dst.v[pos.x + p]; );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, b = dst.v[pos.x - p]; );
    GLSLC(4, d = dst.v[pos.x + p]; );
    GLSLC(3, } );
    GLSLC(0, );
    GLSLC(3, patch_diff = d + a - b - c; );
    GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
    GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
    GLSLC(3, sum = dot(w, src*255); );
    GLSLC(0, );
    if (t > 1) {
        GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
        GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
    } else {
        GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
        GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
    }
    GLSLC(2, } );
    GLSLC(1, } );
}
typedef struct HorizontalPushData {
    uint32_t width[4];
    uint32_t height[4];
    uint32_t ws_stride[4];
    int32_t  patch_size[4];
    float    strength[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint64_t int_stride;
    uint32_t xyoffs_start;
} HorizontalPushData;
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
                                         FFVulkanShader *shd,
                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
                                         int width, int height, int t,
                                         const AVPixFmtDescriptor *desc,
                                         int planes, int *nb_rows)
{
    int err;
    uint8_t *spv_data;
    size_t spv_len;
    void *spv_opaque = NULL;
    FFVulkanDescriptorSetBinding *desc_set;
    int max_dim = FFMAX(width, height);
    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
    int wg_size, wg_rows;
    /* Start from the device's maximum workgroup width; either clamp it down to
     * the largest image dimension, or keep it and add rows until one workgroup
     * spans that dimension */
    wg_size = max_wg;
    wg_rows = 1;

    if (max_wg > max_dim) {
        wg_size = max_dim;
    } else if (max_wg < max_dim) {
        /* Make it fit */
        while (wg_size*wg_rows < max_dim)
            wg_rows++;
    }
    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights",
                          VK_SHADER_STAGE_COMPUTE_BIT,
                          (const char *[]) { "GL_EXT_buffer_reference",
                                             "GL_EXT_buffer_reference2" }, 2,
                          wg_size, 1, 1,
                          0));

    *nb_rows = wg_rows;

    if (t > 1)
        GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
    GLSLC(0, );
    GLSLF(0, #define DTYPE %s ,TYPE_NAME);
    GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
    GLSLC(0, );
    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
    GLSLC(1, DTYPE v[]; );
    GLSLC(0, }; );
    GLSLC(0, );
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 width; );
    GLSLC(1, uvec4 height; );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(1, ivec4 patch_size; );
    GLSLC(1, vec4 strength; );
    GLSLC(1, DataBuffer integral_base; );
    GLSLC(1, uint64_t integral_size; );
    GLSLC(1, uint64_t int_stride; );
    GLSLC(1, uint xyoffs_start; );
    GLSLC(0, }; );
    GLSLC(0, );

    ff_vk_shader_add_push_const(shd, 0, sizeof(HorizontalPushData),
                                VK_SHADER_STAGE_COMPUTE_BIT);
    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name       = "input_img",
            .type       = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            .samplers   = DUP_SAMPLER(sampler),
        },
        {
            .name        = "weights_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_0[];",
        },
        {
            .name        = "sums_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_0[];",
        },
        {
            .name        = "weights_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_1[];",
        },
        {
            .name        = "sums_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_1[];",
        },
        {
            .name        = "weights_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_2[];",
        },
        {
            .name        = "sums_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_2[];",
        },
        {
            .name        = "weights_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_3[];",
        },
        {
            .name        = "sums_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_3[];",
        },
    };
    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1 + 2*desc->nb_components, 0, 0));
    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name        = "xyoffsets_buffer",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "ivec2 xyoffsets[];",
        },
    };
    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
    GLSLC(0, );
    GLSLC(0, void main() );
    GLSLC(0, { );
    GLSLC(1, uint64_t offset; );
    GLSLC(1, DataBuffer dst; );
    GLSLC(1, float s1; );
    GLSLC(1, DTYPE s2; );
    GLSLC(1, DTYPE prefix_sum; );
    GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
    GLSLC(1, int r; );
    GLSLC(1, ivec2 pos; );
    GLSLC(1, int p; );
    GLSLC(0, );
    GLSLC(1, DataBuffer integral_data; );
    GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
    GLSLC(0, );
    GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
    GLSLC(0, );
    GLSLC(1, offset = integral_size * invoc_idx; );
    GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
    for (int i = 0; i < TYPE_ELEMS; i++)
        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i);
    GLSLC(0, );
    GLSLC(1, DTYPE a; );
    GLSLC(1, DTYPE b; );
    GLSLC(1, DTYPE c; );
    GLSLC(1, DTYPE d; );
    GLSLC(0, );
    GLSLC(1, DTYPE patch_diff; );
    if (TYPE_ELEMS == 4) {
        GLSLC(1, vec4 src; );
        GLSLC(1, vec4 w; );
    } else {
        GLSLC(1, vec4 src[4]; );
        GLSLC(1, vec4 w[4]; );
    }
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(0, );
    GLSLC(1, bool lt; );
    GLSLC(1, bool gt; );
    GLSLC(0, );
    for (int i = 0; i < desc->nb_components; i++) {
        int off = desc->comp[i].offset / (FFALIGN(desc->comp[i].depth, 8)/8);
        if (width >= height) {
            insert_horizontal_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
            insert_vertical_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
            insert_weights_pass(shd, *nb_rows, 0, t, i, desc->comp[i].plane, off);
        } else {
            insert_vertical_pass(shd, *nb_rows, 1, desc->comp[i].plane, off);
            insert_horizontal_pass(shd, *nb_rows, 0, desc->comp[i].plane, off);
            insert_weights_pass(shd, *nb_rows, 1, t, i, desc->comp[i].plane, off);
        }
    }

    GLSLC(0, } );
    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));

    RET(ff_vk_shader_register_exec(vkctx, exec, shd));

fail:
    if (spv_opaque)
        spv->free_shader(spv, &spv_opaque);

    return err;
}
typedef struct DenoisePushData {
    uint32_t ws_stride[4];
} DenoisePushData;
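/* The denoise shader reads back the accumulated weights and weighted sums and
 * normalizes them into the output:
 *   out = (sum + src*255) / (1.0 + w_sum) / 255
 * i.e. the original pixel participates with an implicit weight of 1. */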
static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
                                         FFVulkanShader *shd,
                                         VkSampler sampler, FFVkSPIRVCompiler *spv,
                                         const AVPixFmtDescriptor *desc, int planes)
{
    int err;
    uint8_t *spv_data;
    size_t spv_len;
    void *spv_opaque = NULL;
    FFVulkanDescriptorSetBinding *desc_set;

    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise",
                          VK_SHADER_STAGE_COMPUTE_BIT,
                          (const char *[]) { "GL_EXT_buffer_reference",
                                             "GL_EXT_buffer_reference2" }, 2,
                          32, 32, 1,
                          0));

    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(0, }; );

    ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
                                VK_SHADER_STAGE_COMPUTE_BIT);
    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name       = "input_img",
            .type       = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
            .samplers   = DUP_SAMPLER(sampler),
        },
        {
            .name       = "output_img",
            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
            .mem_layout = ff_vk_shader_rep_fmt(vkctx->output_format, FF_VK_REP_FLOAT),
            .mem_quali  = "writeonly",
            .dimensions = 2,
            .elems      = planes,
            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
        },
    };
    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
    desc_set = (FFVulkanDescriptorSetBinding []) {
        {
            .name        = "weights_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_0[];",
        },
        {
            .name        = "sums_buffer_0",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_0[];",
        },
        {
            .name        = "weights_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_1[];",
        },
        {
            .name        = "sums_buffer_1",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_1[];",
        },
        {
            .name        = "weights_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_2[];",
        },
        {
            .name        = "sums_buffer_2",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_2[];",
        },
        {
            .name        = "weights_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float weights_3[];",
        },
        {
            .name        = "sums_buffer_3",
            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .mem_quali   = "readonly",
            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
            .buf_content = "float sums_3[];",
        },
    };
    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2*desc->nb_components, 0, 0));
    GLSLC(0, void main() );
    GLSLC(0, { );
    GLSLC(1, ivec2 size; );
    GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
    GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
    GLSLC(0, );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(1, vec4 src; );
    GLSLC(1, vec4 r; );
    GLSLC(0, );
    GLSLC(1, size = imageSize(output_img[plane]); );
    GLSLC(1, if (!IS_WITHIN(pos, size)) );
    GLSLC(2, return; );
    GLSLC(0, );
    GLSLC(1, src = texture(input_img[plane], pos); );
    GLSLC(0, );
    for (int c = 0; c < desc->nb_components; c++) {
        int off = desc->comp[c].offset / (FFALIGN(desc->comp[c].depth, 8)/8);
        GLSLF(1, if (plane == %i) { ,desc->comp[c].plane);
        GLSLF(2, w_sum = weights_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
        GLSLF(2, sum = sums_%i[pos.y*ws_stride[%i] + pos.x]; ,c, c);
        GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
        GLSLC(1, } );
        GLSLC(0, );
    }

    GLSLC(1, imageStore(output_img[plane], pos, r); );
    GLSLC(0, } );
    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));

    RET(ff_vk_shader_register_exec(vkctx, exec, shd));

fail:
    if (spv_opaque)
        spv->free_shader(spv, &spv_opaque);

    return err;
}
static av_cold int init_filter(AVFilterContext *ctx)
{
    int rad, err;
    int xcnt = 0, ycnt = 0;
    NLMeansVulkanContext *s = ctx->priv;
    FFVulkanContext *vkctx = &s->vkctx;
    const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
    FFVkSPIRVCompiler *spv = NULL;
    int *offsets_buf;
    int offsets_dispatched = 0, nb_dispatches = 0;

    const AVPixFmtDescriptor *desc;
    desc = av_pix_fmt_desc_get(vkctx->output_format);
    if (!desc)
        return AVERROR(EINVAL);
    if (!(s->opts.r & 1)) {
        s->opts.r |= 1;
        av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to %i",
               s->opts.r);
    }

    if (!(s->opts.p & 1)) {
        s->opts.p |= 1;
        av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i",
               s->opts.p);
    }
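    /* Per-component weight scale: strength[i] = -(255*255) / (10*s)^2, so the
     * shader's w = exp(patch_diff * strength) decays with the squared patch
     * difference (computed on a 0..255 scale); a larger s gives a slower decay
     * and therefore stronger denoising. patch[] stores the patch radius, i.e.
     * half of the (odd) patch size. */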
    for (int i = 0; i < 4; i++) {
        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
        str = 10.0f*str;
        str *= -str;
        str = 255.0*255.0 / str;
        s->strength[i] = str;
        if (!(ps & 1)) {
            ps |= 1;
            av_log(ctx, AV_LOG_WARNING, "Patch size should be odd, setting to %i",
                   ps);
        }
        s->patch[i] = ps / 2;
    }
    rad = s->opts.r/2;
    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
    s->nb_offsets = 0;
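    /* Enumerate every offset in the research window except (0,0) itself,
     * (2*rad + 1)^2 - 1 offsets in total. */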
    for (int x = -rad; x <= rad; x++) {
        for (int y = -rad; y <= rad; y++) {
            if (!x && !y)
                continue;

            s->xoffsets[xcnt++] = x;
            s->yoffsets[ycnt++] = y;
            s->nb_offsets++;
        }
    }
    RET(ff_vk_create_buf(&s->vkctx, &s->xyoffsets_buf, 2*s->nb_offsets*sizeof(int32_t), NULL, NULL,
                         VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
                         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                         VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
    RET(ff_vk_map_buffer(&s->vkctx, &s->xyoffsets_buf, (uint8_t **)&offsets_buf, 0));

    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
        offsets_buf[i + 0] = s->xoffsets[i >> 1];
        offsets_buf[i + 1] = s->yoffsets[i >> 1];
    }

    RET(ff_vk_unmap_buffer(&s->vkctx, &s->xyoffsets_buf, 1));
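    /* Each z-layer of a weights dispatch handles one group of TYPE_ELEMS
     * offsets, so the parallelism option is capped at the number of offset
     * groups. Concurrent layers accumulate into the same weights/sums buffers,
     * which needs float atomics; without them, fall back to one layer. */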
    s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
    if (!vkctx->atomic_float_feats.shaderBufferFloat32AtomicAdd) {
        av_log(ctx, AV_LOG_WARNING, "Device doesn't support atomic float adds, "
               "disabling dispatch parallelism\n");
        s->opts.t = 1;
    }
    spv = ff_vk_spirv_init();
    if (!spv) {
        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
        return AVERROR_EXTERNAL;
    }

    s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
    if (!s->qf) {
        av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
        err = AVERROR(ENOTSUP);
        goto fail;
    }
    RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL));
    RET(ff_vk_init_sampler(vkctx, &s->sampler, 1, VK_FILTER_NEAREST));

    RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, s->sampler,
                              spv, s->vkctx.output_width, s->vkctx.output_height,
                              s->opts.t, desc, planes, &s->pl_weights_rows));

    RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, s->sampler,
                              spv, desc, planes));

    RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], &s->shd_weights,
                                        1, 0, 0,
                                        &s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
                                        VK_FORMAT_UNDEFINED));
    do {
        int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
        offsets_dispatched += wg_invoc * TYPE_ELEMS;
        nb_dispatches++;
    } while (offsets_dispatched < s->nb_offsets);

    av_log(ctx, AV_LOG_VERBOSE, "Filter initialized, %i x/y offsets, %i dispatches\n",
           s->nb_offsets, nb_dispatches);
    s->initialized = 1;

fail:
    if (spv)
        spv->uninit(&spv);

    return err;
}
static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec,
                        FFVkBuffer *ws_vk, uint32_t ws_stride[4])
{
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;
    VkBufferMemoryBarrier2 buf_bar[8];
    int nb_buf_bar = 0;

    DenoisePushData pd = {
        { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
    };

    /* Denoise pass pipeline */
    ff_vk_exec_bind_shader(vkctx, exec, &s->shd_denoise);

    /* Push data */
    ff_vk_shader_update_push_const(vkctx, exec, &s->shd_denoise,
                                   VK_SHADER_STAGE_COMPUTE_BIT,
                                   0, sizeof(pd), &pd);
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;

    /* End of denoise pass */
    vk->CmdDispatch(exec->buf,
                    FFALIGN(vkctx->output_width, s->shd_denoise.lg_size[0])/s->shd_denoise.lg_size[0],
                    FFALIGN(vkctx->output_height, s->shd_denoise.lg_size[1])/s->shd_denoise.lg_size[1],
                    av_pix_fmt_count_planes(s->vkctx.output_format));

    return 0;
}
static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in)
{
    int err;
    AVFrame *out = NULL;
    AVFilterContext *ctx = link->dst;
    NLMeansVulkanContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;

    const AVPixFmtDescriptor *desc;
    int plane_widths[4];
    int plane_heights[4];

    int offsets_dispatched = 0;

    /* Integral */
    AVBufferRef *integral_buf = NULL;
    FFVkBuffer *integral_vk;
    size_t int_stride;
    size_t int_size;

    /* Weights/sums */
    AVBufferRef *ws_buf = NULL;
    FFVkBuffer *ws_vk;
    VkDeviceSize weights_offs[4];
    VkDeviceSize sums_offs[4];
    uint32_t ws_stride[4];
    size_t ws_size[4];
    size_t ws_total_size = 0;

    FFVkExecContext *exec;
    VkImageView in_views[AV_NUM_DATA_POINTERS];
    VkImageView out_views[AV_NUM_DATA_POINTERS];
    VkImageMemoryBarrier2 img_bar[8];
    int nb_img_bar = 0;
    VkBufferMemoryBarrier2 buf_bar[8];
    int nb_buf_bar = 0;
    if (!s->initialized)
        RET(init_filter(ctx));

    desc = av_pix_fmt_desc_get(vkctx->output_format);
    if (!desc)
        return AVERROR(EINVAL);
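    /* Each offset group gets its own square integral image: one workgroup
     * covers lg_size[0]*pl_weights_rows texels per side, int_stride is the
     * byte pitch of one row (TYPE_SIZE bytes per texel) and int_size the size
     * of a full image; s->opts.t of them are allocated below, one per
     * parallel dispatch layer. */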
    /* Integral image */
    int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE;
    int_size = s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride;

    /* Plane dimensions */
    for (int i = 0; i < desc->nb_components; i++) {
        plane_widths[i] = !i || (i == 3) ? vkctx->output_width : AV_CEIL_RSHIFT(vkctx->output_width, desc->log2_chroma_w);
        plane_heights[i] = !i || (i == 3) ? vkctx->output_height : AV_CEIL_RSHIFT(vkctx->output_height, desc->log2_chroma_h);
        plane_widths[i] = FFALIGN(plane_widths[i], s->shd_denoise.lg_size[0]);
        plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]);

        ws_stride[i] = plane_widths[i];
        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
        ws_total_size += ws_size[i];
    }
    /* Buffers */
    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->integral_buf_pool, &integral_buf,
                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                  NULL,
                                  s->opts.t * int_size,
                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    if (err < 0)
        return err;
    integral_vk = (FFVkBuffer *)integral_buf->data;

    err = ff_vk_get_pooled_buffer(&s->vkctx, &s->ws_buf_pool, &ws_buf,
                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                  NULL,
                                  ws_total_size * 2,
                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    if (err < 0)
        return err;
    ws_vk = (FFVkBuffer *)ws_buf->data;

    weights_offs[0] = 0;
    sums_offs[0] = ws_total_size;
    for (int i = 1; i < desc->nb_components; i++) {
        weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
        sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1];
    }
    /* Output frame */
    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!out) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    /* Execution context */
    exec = ff_vk_exec_get(&s->vkctx, &s->e);
    ff_vk_exec_start(vkctx, exec);

    /* Dependencies */
    RET(ff_vk_exec_add_dep_frame(vkctx, exec, in,
                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
    RET(ff_vk_exec_add_dep_frame(vkctx, exec, out,
                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));

    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &integral_buf, 1, 0));
    integral_buf = NULL;

    RET(ff_vk_exec_add_dep_buf(vkctx, exec, &ws_buf, 1, 0));
    ws_buf = NULL;
    /* Input frame prep */
    RET(ff_vk_create_imageviews(vkctx, exec, in_views, in, FF_VK_REP_FLOAT));
    ff_vk_frame_barrier(vkctx, exec, in, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_READ_BIT,
                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        VK_QUEUE_FAMILY_IGNORED);

    /* Output frame prep */
    RET(ff_vk_create_imageviews(vkctx, exec, out_views, out, FF_VK_REP_FLOAT));
    ff_vk_frame_barrier(vkctx, exec, out, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);
    nb_buf_bar = 0;
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,
        .offset = 0,
    };

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    integral_vk->stage = buf_bar[1].dstStageMask;
    integral_vk->access = buf_bar[1].dstAccessMask;
    /* Buffer zeroing */
    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);

    nb_buf_bar = 0;
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
        .size = ws_vk->size,
        .offset = 0,
    };

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    });
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    /* Update weights descriptors */
    ff_vk_shader_update_img_array(vkctx, exec, &s->shd_weights, in, in_views, 0, 0,
                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                  s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
        RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 1 + i*2 + 0, 0,
                                            ws_vk, weights_offs[i], ws_size[i],
                                            VK_FORMAT_UNDEFINED));
        RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_weights, 0, 1 + i*2 + 1, 0,
                                            ws_vk, sums_offs[i], ws_size[i],
                                            VK_FORMAT_UNDEFINED));
    }

    /* Update denoise descriptors */
    ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, in, in_views, 0, 0,
                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                  s->sampler);
    ff_vk_shader_update_img_array(vkctx, exec, &s->shd_denoise, out, out_views, 0, 1,
                                  VK_IMAGE_LAYOUT_GENERAL, s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
        RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, i*2 + 0, 0,
                                            ws_vk, weights_offs[i], ws_size[i],
                                            VK_FORMAT_UNDEFINED));
        RET(ff_vk_shader_update_desc_buffer(&s->vkctx, exec, &s->shd_denoise, 1, i*2 + 1, 0,
                                            ws_vk, sums_offs[i], ws_size[i],
                                            VK_FORMAT_UNDEFINED));
    }
    /* Weights pipeline */
    ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights);
    do {
        int wg_invoc;
        HorizontalPushData pd = {
            { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
            { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
            { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
            { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
            { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
            integral_vk->address,
            (uint64_t)int_size,
            (uint64_t)int_stride,
            offsets_dispatched,
        };

        /* Push data */
        ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights,
                                       VK_SHADER_STAGE_COMPUTE_BIT,
                                       0, sizeof(pd), &pd);
        if (offsets_dispatched) {
            nb_buf_bar = 0;
            buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                .srcStageMask = integral_vk->stage,
                .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                .srcAccessMask = integral_vk->access,
                .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .buffer = integral_vk->buf,
                .size = integral_vk->size,
                .offset = 0,
            };

            vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                .pBufferMemoryBarriers = buf_bar,
                .bufferMemoryBarrierCount = nb_buf_bar,
            });
            integral_vk->stage = buf_bar[0].dstStageMask;
            integral_vk->access = buf_bar[0].dstAccessMask;
        }
        wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);

        /* End of horizontal pass */
        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);

        offsets_dispatched += wg_invoc * TYPE_ELEMS;
    } while (offsets_dispatched < s->nb_offsets);

    RET(denoise_pass(s, exec, ws_vk, ws_stride));

    err = ff_vk_exec_submit(vkctx, exec);
    if (err < 0)
        return err;

    err = av_frame_copy_props(out, in);
    if (err < 0)
        goto fail;

    av_frame_free(&in);

    return ff_filter_frame(outlink, out);
fail:
    av_buffer_unref(&integral_buf);
    av_buffer_unref(&ws_buf);
    av_frame_free(&in);
    av_frame_free(&out);
    return err;
}
static void nlmeans_vulkan_uninit(AVFilterContext *avctx)
{
    NLMeansVulkanContext *s = avctx->priv;
    FFVulkanContext *vkctx = &s->vkctx;
    FFVulkanFunctions *vk = &vkctx->vkfn;

    ff_vk_exec_pool_free(vkctx, &s->e);
    ff_vk_shader_free(vkctx, &s->shd_weights);
    ff_vk_shader_free(vkctx, &s->shd_denoise);

    av_buffer_pool_uninit(&s->integral_buf_pool);
    av_buffer_pool_uninit(&s->ws_buf_pool);

    if (s->sampler)
        vk->DestroySampler(vkctx->hwctx->act_dev, s->sampler,
                           vkctx->hwctx->alloc);

    ff_vk_uninit(&s->vkctx);

    av_freep(&s->xoffsets);
    av_freep(&s->yoffsets);

    s->initialized = 0;
}
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {
    { "s",  "denoising strength for all components", OFFSET(opts.s), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
    { "p",  "patch size for all components", OFFSET(opts.p), AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
    { "r",  "research window radius", OFFSET(opts.r), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
    { "t",  "parallelism", OFFSET(opts.t), AV_OPT_TYPE_INT, { .i64 = 36 }, 1, 168, FLAGS },

    { "s1", "denoising strength for component 1", OFFSET(opts.sc[0]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
    { "s2", "denoising strength for component 2", OFFSET(opts.sc[1]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
    { "s3", "denoising strength for component 3", OFFSET(opts.sc[2]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },
    { "s4", "denoising strength for component 4", OFFSET(opts.sc[3]), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 100.0, FLAGS },

    { "p1", "patch size for component 1", OFFSET(opts.pc[0]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
    { "p2", "patch size for component 2", OFFSET(opts.pc[1]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
    { "p3", "patch size for component 3", OFFSET(opts.pc[2]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
    { "p4", "patch size for component 4", OFFSET(opts.pc[3]), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },

    { NULL }
};
AVFILTER_DEFINE_CLASS(nlmeans_vulkan);

static const AVFilterPad nlmeans_vulkan_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .filter_frame = &nlmeans_vulkan_filter_frame,
        .config_props = &ff_vk_filter_config_input,
    },
};
static const AVFilterPad nlmeans_vulkan_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_vk_filter_config_output,
    },
};
const FFFilter ff_vf_nlmeans_vulkan = {
    .p.name         = "nlmeans_vulkan",
    .p.description  = NULL_IF_CONFIG_SMALL("Non-local means denoiser (Vulkan)"),
    .p.priv_class   = &nlmeans_vulkan_class,
    .p.flags        = AVFILTER_FLAG_HWDEVICE,
    .priv_size      = sizeof(NLMeansVulkanContext),
    .init           = &ff_vk_filter_init,
    .uninit         = &nlmeans_vulkan_uninit,
    FILTER_INPUTS(nlmeans_vulkan_inputs),
    FILTER_OUTPUTS(nlmeans_vulkan_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_VULKAN),
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
};