;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to the "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; add SECTION_RODATA and proper include before including this file!

%if ARCH_X86_64

%macro define_constants 1
    %undef w4_plus_w2
    %undef w4_min_w2
    %undef w4_plus_w6
    %undef w4_min_w6
    %undef w1_plus_w3
    %undef w3_min_w1
    %undef w7_plus_w3
    %undef w3_min_w7
    %define w4_plus_w2 w4_plus_w2%1
    %define w4_min_w2  w4_min_w2%1
    %define w4_plus_w6 w4_plus_w6%1
    %define w4_min_w6  w4_min_w6%1
    %define w1_plus_w3 w1_plus_w3%1
    %define w3_min_w1  w3_min_w1%1
    %define w7_plus_w3 w7_plus_w3%1
    %define w3_min_w7  w3_min_w7%1
%endmacro
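
; e.g. "define_constants _10" (the "_10" suffix is hypothetical) makes
; [w4_plus_w2] resolve to a w4_plus_w2_10 table; the including file must
; declare tables with whatever suffix it passes in its SECTION_RODATA.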

; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1 m%2, m%4, m%5
    punpckh%1 m%3, m%4, m%5
%endmacro
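
; e.g. "SBUTTERFLY3 wd, 2, 3, 4, 5" with word lanes m4 = {a0..a7} and
; m5 = {b0..b7} gives m2 = {a0,b0,a1,b1,a2,b2,a3,b3} and
; m3 = {a4,b4,a5,b5,a6,b6,a7,b7}; unlike the 2-operand SBUTTERFLY from
; x86util, the sources survive (via AVX or x86inc's 3-operand emulation).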

; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd    %3, %1, %5 ; { a0 - b0 }[0-3]
    psubd    %4, %2, %6 ; { a0 - b0 }[4-7]
    paddd    %1, %5     ; { a0 + b0 }[0-3]
    paddd    %2, %6     ; { a0 + b0 }[4-7]
    psrad    %1, %7
    psrad    %2, %7
    psrad    %3, %7
    psrad    %4, %7
    packssdw %1, %2     ; row[0]
    packssdw %3, %4     ; row[7]
%endmacro
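
; worked example for one lane with %7 == 15: a0 = 40000, b0 = 9000 gives
; row[0] = (40000 + 9000) >> 15 = 1 and row[7] = (40000 - 9000) >> 15 = 0;
; packssdw then saturates each dword result to a signed word.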

; %1 = initial bias ("" if nop)
; %2 = number of bits to shift at the end
; %3 = qmat (for prores)
; (illustrative invocations follow %endmacro below)
%macro IDCT_1D 2-3
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifstr %1
    mova     m15, [pd_round_ %+ %2]
%else
    paddw    m10, [%1]
%endif
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd  m2, m0, [w4_plus_w6]
    pmaddwd  m3, m1, [w4_plus_w6]
    pmaddwd  m4, m0, [w4_min_w6]
    pmaddwd  m5, m1, [w4_min_w6]
    pmaddwd  m6, m0, [w4_min_w2]
    pmaddwd  m7, m1, [w4_min_w2]
    pmaddwd  m0, [w4_plus_w2]
    pmaddwd  m1, [w4_plus_w2]
%ifstr %1
    ; Adding 1<<(%2-1) for >=15 bits values
    paddd    m2, m15
    paddd    m3, m15
    paddd    m4, m15
    paddd    m5, m15
    paddd    m6, m15
    paddd    m7, m15
    paddd    m0, m15
    paddd    m1, m15
%endif

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd  m10, m8, [w4_plus_w6]
    pmaddwd  m11, m9, [w4_plus_w6]
    paddd    m0, m10 ; a0[0-3]
    paddd    m1, m11 ; a0[4-7]
    pmaddwd  m10, m8, [w4_min_w6]
    pmaddwd  m11, m9, [w4_min_w6]
    paddd    m6, m10 ; a3[0-3]
    paddd    m7, m11 ; a3[4-7]
    pmaddwd  m10, m8, [w4_min_w2]
    pmaddwd  m11, m9, [w4_min_w2]
    pmaddwd  m8, [w4_plus_w2]
    pmaddwd  m9, [w4_plus_w2]
    psubd    m4, m10 ; a2[0-3] intermediate
    psubd    m5, m11 ; a2[4-7] intermediate
    psubd    m2, m8  ; a1[0-3] intermediate
    psubd    m3, m9  ; a1[4-7] intermediate

    ; load/store
    mova     [blockq+  0], m0
    mova     [blockq+ 32], m2
    mova     [blockq+ 64], m4
    mova     [blockq+ 96], m6
    mova     m10, [blockq+ 16] ; { row[1] }[0-7]
    mova     m8,  [blockq+ 48] ; { row[3] }[0-7]
    mova     m13, [blockq+ 80] ; { row[5] }[0-7]
    mova     m14, [blockq+112] ; { row[7] }[0-7]
    mova     [blockq+ 16], m1
    mova     [blockq+ 48], m3
    mova     [blockq+ 80], m5
    mova     [blockq+112], m7
%if %0 == 3
    pmullw   m10, [%3+ 16]
    pmullw   m8,  [%3+ 48]
    pmullw   m13, [%3+ 80]
    pmullw   m14, [%3+112]
%endif

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd  m2, m0, [w3_min_w7]
    pmaddwd  m3, m1, [w3_min_w7]
    pmaddwd  m4, m0, [w5_min_w1]
    pmaddwd  m5, m1, [w5_min_w1]
    pmaddwd  m6, m0, [w7_min_w5]
    pmaddwd  m7, m1, [w7_min_w5]
    pmaddwd  m0, [w1_plus_w3]
    pmaddwd  m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0, W5, row[5]);
    ; MAC(b0, W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2, W7, row[5]);
    ; MAC(b2, W3, row[7]);
    ; MAC(b3, W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd  m10, m8, [w1_plus_w5]
    pmaddwd  m11, m9, [w1_plus_w5]
    pmaddwd  m12, m8, [w5_plus_w7]
    pmaddwd  m13, m9, [w5_plus_w7]
    psubd    m2, m10 ; b1[0-3]
    psubd    m3, m11 ; b1[4-7]
    paddd    m0, m12 ; b0[0-3]
    paddd    m1, m13 ; b0[4-7]
    pmaddwd  m12, m8, [w7_plus_w3]
    pmaddwd  m13, m9, [w7_plus_w3]
    pmaddwd  m8, [w3_min_w1]
    pmaddwd  m9, [w3_min_w1]
    paddd    m4, m12 ; b2[0-3]
    paddd    m5, m13 ; b2[4-7]
    paddd    m6, m8  ; b3[0-3]
    paddd    m7, m9  ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova     m8, [blockq+ 0]  ; a0[0-3]
    mova     m9, [blockq+16]  ; a0[4-7]
    SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
    mova     m0, [blockq+32]  ; a1[0-3]
    mova     m1, [blockq+48]  ; a1[4-7]
    SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
    mova     m1, [blockq+64]  ; a2[0-3]
    mova     m2, [blockq+80]  ; a2[4-7]
    SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
    mova     m2, [blockq+96]  ; a3[0-3]
    mova     m3, [blockq+112] ; a3[4-7]
    SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
%endmacro
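
; Hypothetical invocations (pw_bias/qmatq are illustrative names, not what
; the real callers pass; pd_round_15 must be declared by the including file):
;   IDCT_1D "", 15          ; string %1: round via the pd_round_15 constant
;   IDCT_1D pw_bias, 15     ; table %1: add a bias to row[0] instead
;   IDCT_1D "", 15, qmatq   ; 3-arg form: also pmullw a prores qmat into the
;                           ; odd rows reloaded mid-macro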

; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
;                                  int16_t *block, const int16_t *qmat);

; %1 = row bias macro ("" if nop)
; %2 = row shift
; %3 = column bias macro ("" if nop)
; %4 = column shift
; %5 = final action (nothing, "store", "put", "add")
; %6 = min pixel value
; %7 = max pixel value
; %8 = qmat (for prores)
; (illustrative instantiations follow the macro body)

%macro IDCT_FN 4-8
    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova     m10, [blockq+ 0] ; { row[0] }[0-7]
    mova     m8,  [blockq+32] ; { row[2] }[0-7]
    mova     m13, [blockq+64] ; { row[4] }[0-7]
    mova     m12, [blockq+96] ; { row[6] }[0-7]

%if %0 == 8
    pmullw   m10, [%8+ 0]
    pmullw   m8,  [%8+32]
    pmullw   m13, [%8+64]
    pmullw   m12, [%8+96]

    IDCT_1D  %1, %2, %8
%elif %2 == 11
    ; This copies the DC-only shortcut. When there is only a DC coefficient,
    ; the C implementation shifts the value and splats it to all coefficients
    ; rather than multiplying and doing the full IDCT. This causes a
    ; difference on 8-bit because the coefficient is 16383 rather than 16384
    ; (which you can get with shifting).
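    ; In C terms, the shortcut is roughly (a sketch, not the exact upstream
    ; code): for each 16-bit lane, if rows 1..7 are all zero, the output is
    ; row[0] << 3, the exact x8 that a W4 of 16384 with an 11-bit shift
    ; would give; the pcmpeqw mask built below selects, per lane, between
    ; that splat and the full-IDCT result of the row pass.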
    por      m1, m8, m13
    por      m1, m12
    por      m1, [blockq+ 16] ; { row[1] }[0-7]
    por      m1, [blockq+ 48] ; { row[3] }[0-7]
    por      m1, [blockq+ 80] ; { row[5] }[0-7]
    por      m1, [blockq+112] ; { row[7] }[0-7]
    pxor     m2, m2
    pcmpeqw  m1, m2
    psllw    m2, m10, 3
    pand     m2, m1
    pcmpeqb  m3, m3
    pxor     m1, m3
    mova     [rsp], m1
    mova     [rsp+16], m2

    IDCT_1D  %1, %2

    mova     m5, [rsp]
    mova     m6, [rsp+16]
    pand     m8, m5
    por      m8, m6
    pand     m0, m5
    por      m0, m6
    pand     m1, m5
    por      m1, m6
    pand     m2, m5
    por      m2, m6
    pand     m4, m5
    por      m4, m6
    pand     m11, m5
    por      m11, m6
    pand     m9, m5
    por      m9, m6
    pand     m10, m5
    por      m10, m6
%else
    IDCT_1D  %1, %2
%endif

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova     [blockq+ 16], m0
    mova     [blockq+ 48], m2
    mova     [blockq+ 80], m11
    mova     [blockq+112], m10
    SWAP     8, 10
    SWAP     1, 8
    SWAP     4, 13
    SWAP     9, 12
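    ; the SWAPs route the transposed rows into the registers the column
    ; pass of IDCT_1D reads: m10 = row[0], m8 = row[2], m13 = row[4],
    ; m12 = row[6]; rows 1/3/5/7 were spilled to blockq above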

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D  %3, %4

    ; clip/store
%if %0 >= 5
%ifidn %5, "store"
    ; No clamping: this is the pure idct
    mova     [blockq+  0], m8
    mova     [blockq+ 16], m0
    mova     [blockq+ 32], m1
    mova     [blockq+ 48], m2
    mova     [blockq+ 64], m4
    mova     [blockq+ 80], m11
    mova     [blockq+ 96], m9
    mova     [blockq+112], m10
%elifidn %5, "put"
%ifidn %6, 0
    pxor     m3, m3
%else
    mova     m3, [%6]
%endif ; ifidn %6, 0
    mova     m5, [%7]
    pmaxsw   m8,  m3
    pmaxsw   m0,  m3
    pmaxsw   m1,  m3
    pmaxsw   m2,  m3
    pmaxsw   m4,  m3
    pmaxsw   m11, m3
    pmaxsw   m9,  m3
    pmaxsw   m10, m3
    pminsw   m8,  m5
    pminsw   m0,  m5
    pminsw   m1,  m5
    pminsw   m2,  m5
    pminsw   m4,  m5
    pminsw   m11, m5
    pminsw   m9,  m5
    pminsw   m10, m5

    lea      r2, [r1*3]
    mova     [r0      ], m8
    mova     [r0+r1   ], m0
    mova     [r0+r1*2 ], m1
    mova     [r0+r2   ], m2
    lea      r0, [r0+r1*4]
    mova     [r0      ], m4
    mova     [r0+r1   ], m11
    mova     [r0+r1*2 ], m9
    mova     [r0+r2   ], m10
%endif ; %5 action
%endif ; %0 >= 5
%endmacro
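
; Hypothetical instantiations (shifts, bias tables and clip bounds here are
; illustrative only; see the real callers for the actual values):
;   IDCT_FN "", 12, "", 19, "store"   ; raw idct, rows written back to blockq
;   IDCT_FN "", 15, pw_bias, 18, "put", 0, pw_1023, qmatq
;                                     ; qmat multiply, clip to [0, pw_1023],
;                                     ; store to r0 with stride r1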

%endif ; ARCH_X86_64