; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of Libav.
; *
; * Libav is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * Libav is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with Libav; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
max_pixels_10:          times 16  dw ((1 << 10)-1)

SECTION .text

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
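;
; The 8-bit versions cannot add signed 16-bit residuals to unsigned bytes
; directly, so each residual is split into its positive part (packuswb clamps
; negative words to zero) and its negated part; the two halves are then
; applied with paddusb/psubusb, which saturates the result to [0, 255].
; Roughly equivalent C for an NxN block (a sketch for reference only, not the
; decoder's scalar implementation):
;
;     for (y = 0; y < N; y++) {
;         for (x = 0; x < N; x++)
;             dst[x] = av_clip_uint8(dst[x] + res[x]);
;         dst += stride;
;         res += N;
;     }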

%macro ADD_RES_MMX_4_8 0
    mova              m0, [r1]
    mova              m2, [r1+8]
    pxor              m1, m1
    pxor              m3, m3
    psubw             m1, m0
    psubw             m3, m2
    packuswb          m0, m2
    packuswb          m1, m3

    movd              m2, [r0]
    movd              m3, [r0+r2]
    punpckldq         m2, m3
    paddusb           m0, m2
    psubusb           m0, m1
    movd            [r0], m0
    psrlq             m0, 32
    movd         [r0+r2], m0
%endmacro

INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    ADD_RES_MMX_4_8
    add               r1, 16
    lea               r0, [r0+r2*2]
    ADD_RES_MMX_4_8
    RET

%macro ADD_RES_SSE_8_8 0
    pxor              m3, m3
    mova              m4, [r1]
    mova              m6, [r1+16]
    mova              m0, [r1+32]
    mova              m2, [r1+48]
    psubw             m5, m3, m4
    psubw             m7, m3, m6
    psubw             m1, m3, m0
    packuswb          m4, m0
    packuswb          m5, m1
    psubw             m3, m2
    packuswb          m6, m2
    packuswb          m7, m3

    movq              m0, [r0]
    movq              m1, [r0+r2]
    movhps            m0, [r0+r2*2]
    movhps            m1, [r0+r3]
    paddusb           m0, m4
    paddusb           m1, m6
    psubusb           m0, m5
    psubusb           m1, m7
    movq            [r0], m0
    movq         [r0+r2], m1
    movhps     [r0+2*r2], m0
    movhps       [r0+r3], m1
%endmacro
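
; ADD_RES_SSE_16_32_8 %1: byte offset into the residual buffer (r1),
; %2/%3: two destination addresses.  m0 must be zero on entry; it is used to
; negate the residuals.  Each destination receives 16 bytes with SSE2/AVX and,
; through the vinserti128 path, 32 bytes with AVX2.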
%macro ADD_RES_SSE_16_32_8 3
    mova             xm2, [r1+%1]
    mova             xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128       m2, m2, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
%endif
    psubw             m1, m0, m2
    psubw             m5, m0, m6
    packuswb          m2, m6
    packuswb          m1, m5

    mova             xm4, [r1+%1+mmsize*2]
    mova             xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128       m4, m4, [r1+%1+96 ], 1
    vinserti128       m6, m6, [r1+%1+112], 1
%endif
    psubw             m3, m0, m4
    psubw             m5, m0, m6
    packuswb          m4, m6
    packuswb          m3, m5

    paddusb           m2, [%2]
    paddusb           m4, [%3]
    psubusb           m2, m1
    psubusb           m4, m3
    mova            [%2], m2
    mova            [%3], m4
%endmacro

%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    lea               r3, [r2*3]
    ADD_RES_SSE_8_8
    add               r1, 64
    lea               r0, [r0+r2*4]
    ADD_RES_SSE_8_8
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor              m0, m0
    lea               r3, [r2*3]
    mov              r4d, 4
.loop:
    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add               r1, 128
    lea               r0, [r0+r2*4]
    dec              r4d
    jg .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor              m0, m0
    mov              r4d, 16
.loop:
    ADD_RES_SSE_16_32_8  0, r0,    r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add               r1, 128
    lea               r0, [r0+r2*2]
    dec              r4d
    jg .loop
    RET
%endmacro
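
; TRANSFORM_ADD_8 is expanded once per instruction set below; cglobal appends
; the matching _sse2/_avx suffix to each emitted function name.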

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor              m0, m0
    lea               r3, [r2*3]
    mov              r4d, 8
.loop:
    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add               r1, 256
    lea               r0, [r0+r2*4]
    dec              r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL
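
; The 10-bit functions below work on 16-bit pixels, so the residuals are added
; with plain paddw and the sums clamped with CLIPW to [0, max_pixels_10].
; Roughly equivalent C (a sketch for reference only; stride is in bytes):
;
;     for (y = 0; y < N; y++) {
;         for (x = 0; x < N; x++)
;             dst[x] = av_clip_uintp2(dst[x] + res[x], 10);
;         dst  = (uint16_t *)((uint8_t *)dst + stride);
;         res += N;
;     }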

%macro ADD_RES_SSE_8_10 4
    mova              m0, [%4]
    mova              m1, [%4+16]
    mova              m2, [%4+32]
    mova              m3, [%4+48]
    paddw             m0, [%1+0]
    paddw             m1, [%1+%2]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

%macro ADD_RES_MMX_4_10 3
    mova              m0, [%1+0]
    mova              m1, [%1+%2]
    paddw             m0, [%3]
    paddw             m1, [%3+8]
    CLIPW             m0, m2, m3
    CLIPW             m1, m2, m3
    mova          [%1+0], m0
    mova         [%1+%2], m1
%endmacro

%macro ADD_RES_SSE_16_10 3
    mova              m0, [%3]
    mova              m1, [%3+16]
    mova              m2, [%3+32]
    mova              m3, [%3+48]
    paddw             m0, [%1]
    paddw             m1, [%1+16]
    paddw             m2, [%1+%2]
    paddw             m3, [%1+%2+16]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+%2], m2
    mova      [%1+%2+16], m3
%endmacro

%macro ADD_RES_SSE_32_10 2
    mova              m0, [%2]
    mova              m1, [%2+16]
    mova              m2, [%2+32]
    mova              m3, [%2+48]

    paddw             m0, [%1]
    paddw             m1, [%1+16]
    paddw             m2, [%1+32]
    paddw             m3, [%1+48]
    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+16], m1
    mova         [%1+32], m2
    mova         [%1+48], m3
%endmacro

%macro ADD_RES_AVX2_16_10 4
    mova              m0, [%4]
    mova              m1, [%4+32]
    mova              m2, [%4+64]
    mova              m3, [%4+96]

    paddw             m0, [%1+0]
    paddw             m1, [%1+%2]
    paddw             m2, [%1+%2*2]
    paddw             m3, [%1+%3]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova          [%1+0], m0
    mova         [%1+%2], m1
    mova       [%1+%2*2], m2
    mova         [%1+%3], m3
%endmacro

%macro ADD_RES_AVX2_32_10 3
    mova              m0, [%3]
    mova              m1, [%3+32]
    mova              m2, [%3+64]
    mova              m3, [%3+96]

    paddw             m0, [%1]
    paddw             m1, [%1+32]
    paddw             m2, [%1+%2]
    paddw             m3, [%1+%2+32]

    CLIPW             m0, m4, m5
    CLIPW             m1, m4, m5
    CLIPW             m2, m4, m5
    CLIPW             m3, m4, m5
    mova            [%1], m0
    mova         [%1+32], m1
    mova         [%1+%2], m2
    mova      [%1+%2+32], m3
%endmacro
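
; Argument order shared by the 10-bit macros above: destination pointer first,
; then the stride (plus 3*stride where a separate argument is taken), with the
; residual pointer last.  m4/m5 (m2/m3 for the MMX version) must already hold
; 0 and max_pixels_10 for the CLIPW clamping.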

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor              m2, m2
    mova              m3, [max_pixels_10]
    ADD_RES_MMX_4_10  r0, r2, r1
    add               r1, 16
    lea               r0, [r0+2*r2]
    ADD_RES_MMX_4_10  r0, r2, r1
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    ADD_RES_SSE_8_10  r0, r2, r3, r1
    lea               r0, [r0+r2*4]
    add               r1, 64
    ADD_RES_SSE_8_10  r0, r2, r3, r1
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 8
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea               r0, [r0+r2*2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 32
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea               r0, [r0+r2]
    add               r1, 64
    dec              r4d
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]
    lea               r3, [r2*3]

    mov              r4d, 4
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea               r0, [r0+r2*4]
    add               r1, 128
    dec              r4d
    jg .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor              m4, m4
    mova              m5, [max_pixels_10]

    mov              r4d, 16
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea               r0, [r0+r2*2]
    add               r1, 128
    dec              r4d
    jg .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL