2 * MPEG video MMX templates
4 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* Helper macros, 16-byte-register flavour (MMREG_WIDTH "16").
 * MMREG_WIDTH is a string literal so it can be pasted into the
 * "add $..." instruction that advances the loop index in dct_quantize.
 * NOTE(review): the preprocessor conditionals that select between the
 * macro sets, and several #define header lines, are not visible here. */
33 #define MMREG_WIDTH "16"
37 "pshuflw $0, "a", "a" \n\t"\
38 "punpcklwd "a", "a" \n\t"
/* Emits pmaxsw with a as the source and b as the destination operand. */
39 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
41 "movhlps "a", "b" \n\t"\
43 "pshuflw $0x0E, "a", "b" \n\t"\
45 "pshuflw $0x01, "a", "b" \n\t"\
48 #define MMREG_WIDTH "8"
/* 8-byte-register flavour (MMREG_WIDTH "8"): SPREADW is a single pshufw
 * with selector 0 applied to the register in place. */
52 #define SPREADW(a) "pshufw $0, "a", "a" \n\t"
/* Same pmaxsw expansion as in the 16-byte flavour. */
53 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
55 "pshufw $0x0E, "a", "b" \n\t"\
57 "pshufw $0x01, "a", "b" \n\t"\
61 "punpcklwd "a", "a" \n\t"\
62 "punpcklwd "a", "a" \n\t"
64 "psubusw "a", "b" \n\t"\
68 "psrlq $32, "a" \n\t"\
71 "psrlq $16, "a" \n\t"\
78 #define SAVE_SIGN(a,b) \
79 "movdqa "b", "a" \n\t"\
81 #define RESTORE_SIGN(a,b) \
82 "psignw "a", "b" \n\t"
/* Alternative SAVE_SIGN/RESTORE_SIGN pair built from pcmpgtw/psubw
 * instead of psignw: a receives a per-lane mask and b ends up holding
 * ABS(block[i]); RESTORE_SIGN ends with the psubw that re-applies the sign.
 * NOTE(review): some lines of both macros are not visible here. */
84 #define SAVE_SIGN(a,b) \
86 "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
88 "psubw "a", "b" \n\t" /* ABS(block[i]) */
89 #define RESTORE_SIGN(a,b) \
91 "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
/*
 * dct_quantize: forward-transform and quantize one coefficient block in place.
 *
 * Steps visible in this function:
 *  - run RENAMEl(ff_fdct) on block and call s->denoise_dct(s, block)
 *    (NOTE(review): any condition guarding the denoise call is not visible here);
 *  - select bias/qmat from s->q_intra_matrix16[qscale] or
 *    s->q_inter_matrix16[qscale] (index [1] is bias, index [0] is qmat);
 *  - quantize with inline SIMD assembly, storing results into the local
 *    temp_block while writing zeros back to block;
 *  - derive the value annotated as *overflow from the OR-ed quantized
 *    magnitudes and s->max_qcoeff;
 *  - copy the coefficients from temp_block back into block at the
 *    positions required by s->dsp.idct_permutation_type.
 *
 * Returns last_non_zero_p1 - 1.
 */
94 static int RENAME(dct_quantize
)(MpegEncContext
*s
,
95 DCTELEM
*block
, int n
,
96 int qscale
, int *overflow
)
98 x86_reg last_non_zero_p1
;
99 int level
=0, q
; //=0 is because gcc says uninitialized ...
100 const uint16_t *qmat
, *bias
;
101 DECLARE_ALIGNED_16(int16_t, temp_block
[64]);
/* Tests only the low three address bits, i.e. 8-byte alignment.
 * NOTE(review): the buffer is declared with DECLARE_ALIGNED_16 and the
 * pointer is narrowed to int here -- confirm this check is still what the
 * 16-byte-register variant needs. */
103 assert((7&(int)(&temp_block
[0])) == 0); //did gcc align it correctly?
106 RENAMEl(ff_fdct
) (block
); //cannot be anything else ...
109 s
->denoise_dct(s
, block
);
/* DC coefficient: two alternative asm sequences produce level from
 * (block[0]>>2) + q. The first takes a reciprocal from ff_inverse[q<<1],
 * the second takes q<<1 directly. NOTE(review): the asm templates and the
 * preprocessor condition choosing between them are only partly visible. */
117 /* note: block[0] is assumed to be positive */
122 : "=d" (level
), "=a"(dummy
)
123 : "a" ((block
[0]>>2) + q
), "c" (ff_inverse
[q
<<1])
127 "xorl %%edx, %%edx \n\t"
129 "movzwl %%ax, %%eax \n\t"
131 : "a" ((block
[0]>>2) + q
), "c" (q
<<1)
136 /* For AIC we skip quant/dequant of INTRADC */
137 level
= (block
[0] + 4)>>3;
139 block
[0]=0; //avoid fake overflow
140 // temp_block[0] = (block[0] + (q >> 1)) / q;
/* Intra-table setup: the scan starts with last_non_zero_p1 = 1. */
141 last_non_zero_p1
= 1;
142 bias
= s
->q_intra_matrix16
[qscale
][1];
143 qmat
= s
->q_intra_matrix16
[qscale
][0];
/* Inter-table setup, the alternative to the assignments above
 * (presumably selected on s->mb_intra -- the condition is not visible here). */
145 last_non_zero_p1
= 0;
146 bias
= s
->q_inter_matrix16
[qscale
][1];
147 qmat
= s
->q_inter_matrix16
[qscale
][0];
/* H.263/H.261 output without mpeg_quant: qmat and the negated bias are
 * loaded into MM5/MM6 without the running index, before the index register
 * is initialised, and are then used for every group of coefficients. */
150 if((s
->out_format
== FMT_H263
|| s
->out_format
== FMT_H261
) && s
->mpeg_quant
==0){
153 "movd %%"REG_a
", "MM
"3 \n\t" // last_non_zero_p1
155 "pxor "MM
"7, "MM
"7 \n\t" // 0
156 "pxor "MM
"4, "MM
"4 \n\t" // 0
157 MOVQ
" (%2), "MM
"5 \n\t" // qmat[0]
158 "pxor "MM
"6, "MM
"6 \n\t"
159 "psubw (%3), "MM
"6 \n\t" // -bias[0]
/* The byte index starts at -128 and is advanced by MMREG_WIDTH; the indexed
 * pointer operands are passed offset by +64 elements to compensate. */
160 "mov $-128, %%"REG_a
" \n\t"
163 MOVQ
" (%1, %%"REG_a
"), "MM
"0 \n\t" // block[i]
164 SAVE_SIGN(MM
"1", MM
"0") // ABS(block[i])
165 "psubusw "MM
"6, "MM
"0 \n\t" // ABS(block[i]) + bias[0]
166 "pmulhw "MM
"5, "MM
"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
/* MM4 accumulates the OR of the quantized magnitudes (before the sign is
 * restored); it feeds the overflow check after the loop. */
167 "por "MM
"0, "MM
"4 \n\t"
168 RESTORE_SIGN(MM
"1", MM
"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
169 MOVQ
" "MM
"0, (%5, %%"REG_a
") \n\t"
170 "pcmpeqw "MM
"7, "MM
"0 \n\t" // out==0 ? 0xFF : 0x00
171 MOVQ
" (%4, %%"REG_a
"), "MM
"1 \n\t"
172 MOVQ
" "MM
"7, (%1, %%"REG_a
") \n\t" // 0
173 "pandn "MM
"1, "MM
"0 \n\t"
175 "add $"MMREG_WIDTH
", %%"REG_a
" \n\t"
178 "movd "MM
"3, %%"REG_a
" \n\t"
179 "movzb %%al, %%"REG_a
" \n\t" // last_non_zero_p1
180 : "+a" (last_non_zero_p1
)
181 : "r" (block
+64), "r" (qmat
), "r" (bias
),
182 "r" (inv_zigzag_direct16
+64), "r" (temp_block
+64)
/* Other formats: bias and qmat are read inside the loop with the running
 * index, so their operands are passed offset by +64 as well. */
186 "movd %%"REG_a
", "MM
"3 \n\t" // last_non_zero_p1
188 "pxor "MM
"7, "MM
"7 \n\t" // 0
189 "pxor "MM
"4, "MM
"4 \n\t" // 0
190 "mov $-128, %%"REG_a
" \n\t"
193 MOVQ
" (%1, %%"REG_a
"), "MM
"0 \n\t" // block[i]
194 SAVE_SIGN(MM
"1", MM
"0") // ABS(block[i])
195 MOVQ
" (%3, %%"REG_a
"), "MM
"6 \n\t" // bias[0]
196 "paddusw "MM
"6, "MM
"0 \n\t" // ABS(block[i]) + bias[0]
197 MOVQ
" (%2, %%"REG_a
"), "MM
"5 \n\t" // qmat[i]
198 "pmulhw "MM
"5, "MM
"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
199 "por "MM
"0, "MM
"4 \n\t"
200 RESTORE_SIGN(MM
"1", MM
"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
201 MOVQ
" "MM
"0, (%5, %%"REG_a
") \n\t"
202 "pcmpeqw "MM
"7, "MM
"0 \n\t" // out==0 ? 0xFF : 0x00
203 MOVQ
" (%4, %%"REG_a
"), "MM
"1 \n\t"
204 MOVQ
" "MM
"7, (%1, %%"REG_a
") \n\t" // 0
205 "pandn "MM
"1, "MM
"0 \n\t"
207 "add $"MMREG_WIDTH
", %%"REG_a
" \n\t"
210 "movd "MM
"3, %%"REG_a
" \n\t"
211 "movzb %%al, %%"REG_a
" \n\t" // last_non_zero_p1
212 : "+a" (last_non_zero_p1
)
213 : "r" (block
+64), "r" (qmat
+64), "r" (bias
+64),
214 "r" (inv_zigzag_direct16
+64), "r" (temp_block
+64)
/* Overflow check: MM1 is loaded from s->max_qcoeff and subtracted from the
 * accumulated magnitudes in MM4 with unsigned saturation, so a lane stays
 * nonzero only where MM4 exceeds the matching lane of MM1. The packed
 * result is moved to the output operand. */
218 "movd %1, "MM
"1 \n\t" // max_qcoeff
220 "psubusw "MM
"1, "MM
"4 \n\t"
221 "packuswb "MM
"4, "MM
"4 \n\t"
223 "packuswb "MM
"4, "MM
"4 \n\t"
225 "movd "MM
"4, %0 \n\t" // *overflow
227 : "g" (s
->max_qcoeff
)
/* Coefficient 0: the separately computed DC level for intra blocks,
 * otherwise the quantized value from temp_block. */
230 if(s
->mb_intra
) block
[0]= level
;
231 else block
[0]= temp_block
[0];
/* Copy the quantized coefficients back into block. Each permutation type
 * has a hand-unrolled list of block[dst] = temp_block[src] moves; the
 * source indices are visited in the same order (0x01, 0x08, 0x10, 0x09, ...)
 * in every list, and copying stops at the first threshold that
 * last_non_zero_p1 does not exceed. */
233 if(s
->dsp
.idct_permutation_type
== FF_SIMPLE_IDCT_PERM
){
234 if(last_non_zero_p1
<= 1) goto end
;
235 block
[0x08] = temp_block
[0x01]; block
[0x10] = temp_block
[0x08];
236 block
[0x20] = temp_block
[0x10];
237 if(last_non_zero_p1
<= 4) goto end
;
238 block
[0x18] = temp_block
[0x09]; block
[0x04] = temp_block
[0x02];
239 block
[0x09] = temp_block
[0x03];
240 if(last_non_zero_p1
<= 7) goto end
;
241 block
[0x14] = temp_block
[0x0A]; block
[0x28] = temp_block
[0x11];
242 block
[0x12] = temp_block
[0x18]; block
[0x02] = temp_block
[0x20];
243 if(last_non_zero_p1
<= 11) goto end
;
244 block
[0x1A] = temp_block
[0x19]; block
[0x24] = temp_block
[0x12];
245 block
[0x19] = temp_block
[0x0B]; block
[0x01] = temp_block
[0x04];
246 block
[0x0C] = temp_block
[0x05];
247 if(last_non_zero_p1
<= 16) goto end
;
248 block
[0x11] = temp_block
[0x0C]; block
[0x29] = temp_block
[0x13];
249 block
[0x16] = temp_block
[0x1A]; block
[0x0A] = temp_block
[0x21];
250 block
[0x30] = temp_block
[0x28]; block
[0x22] = temp_block
[0x30];
251 block
[0x38] = temp_block
[0x29]; block
[0x06] = temp_block
[0x22];
252 if(last_non_zero_p1
<= 24) goto end
;
253 block
[0x1B] = temp_block
[0x1B]; block
[0x21] = temp_block
[0x14];
254 block
[0x1C] = temp_block
[0x0D]; block
[0x05] = temp_block
[0x06];
255 block
[0x0D] = temp_block
[0x07]; block
[0x15] = temp_block
[0x0E];
256 block
[0x2C] = temp_block
[0x15]; block
[0x13] = temp_block
[0x1C];
257 if(last_non_zero_p1
<= 32) goto end
;
258 block
[0x0B] = temp_block
[0x23]; block
[0x34] = temp_block
[0x2A];
259 block
[0x2A] = temp_block
[0x31]; block
[0x32] = temp_block
[0x38];
260 block
[0x3A] = temp_block
[0x39]; block
[0x26] = temp_block
[0x32];
261 block
[0x39] = temp_block
[0x2B]; block
[0x03] = temp_block
[0x24];
262 if(last_non_zero_p1
<= 40) goto end
;
263 block
[0x1E] = temp_block
[0x1D]; block
[0x25] = temp_block
[0x16];
264 block
[0x1D] = temp_block
[0x0F]; block
[0x2D] = temp_block
[0x17];
265 block
[0x17] = temp_block
[0x1E]; block
[0x0E] = temp_block
[0x25];
266 block
[0x31] = temp_block
[0x2C]; block
[0x2B] = temp_block
[0x33];
267 if(last_non_zero_p1
<= 48) goto end
;
268 block
[0x36] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
269 block
[0x23] = temp_block
[0x34]; block
[0x3C] = temp_block
[0x2D];
270 block
[0x07] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
271 block
[0x0F] = temp_block
[0x27]; block
[0x35] = temp_block
[0x2E];
272 if(last_non_zero_p1
<= 56) goto end
;
273 block
[0x2E] = temp_block
[0x35]; block
[0x33] = temp_block
[0x3C];
274 block
[0x3E] = temp_block
[0x3D]; block
[0x27] = temp_block
[0x36];
275 block
[0x3D] = temp_block
[0x2F]; block
[0x2F] = temp_block
[0x37];
276 block
[0x37] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
/* Same copy sequence with the FF_LIBMPEG2_IDCT_PERM destination indices. */
277 }else if(s
->dsp
.idct_permutation_type
== FF_LIBMPEG2_IDCT_PERM
){
278 if(last_non_zero_p1
<= 1) goto end
;
279 block
[0x04] = temp_block
[0x01];
280 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
281 if(last_non_zero_p1
<= 4) goto end
;
282 block
[0x0C] = temp_block
[0x09]; block
[0x01] = temp_block
[0x02];
283 block
[0x05] = temp_block
[0x03];
284 if(last_non_zero_p1
<= 7) goto end
;
285 block
[0x09] = temp_block
[0x0A]; block
[0x14] = temp_block
[0x11];
286 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
287 if(last_non_zero_p1
<= 11) goto end
;
288 block
[0x1C] = temp_block
[0x19];
289 block
[0x11] = temp_block
[0x12]; block
[0x0D] = temp_block
[0x0B];
290 block
[0x02] = temp_block
[0x04]; block
[0x06] = temp_block
[0x05];
291 if(last_non_zero_p1
<= 16) goto end
;
292 block
[0x0A] = temp_block
[0x0C]; block
[0x15] = temp_block
[0x13];
293 block
[0x19] = temp_block
[0x1A]; block
[0x24] = temp_block
[0x21];
294 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
295 block
[0x2C] = temp_block
[0x29]; block
[0x21] = temp_block
[0x22];
296 if(last_non_zero_p1
<= 24) goto end
;
297 block
[0x1D] = temp_block
[0x1B]; block
[0x12] = temp_block
[0x14];
298 block
[0x0E] = temp_block
[0x0D]; block
[0x03] = temp_block
[0x06];
299 block
[0x07] = temp_block
[0x07]; block
[0x0B] = temp_block
[0x0E];
300 block
[0x16] = temp_block
[0x15]; block
[0x1A] = temp_block
[0x1C];
301 if(last_non_zero_p1
<= 32) goto end
;
302 block
[0x25] = temp_block
[0x23]; block
[0x29] = temp_block
[0x2A];
303 block
[0x34] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
304 block
[0x3C] = temp_block
[0x39]; block
[0x31] = temp_block
[0x32];
305 block
[0x2D] = temp_block
[0x2B]; block
[0x22] = temp_block
[0x24];
306 if(last_non_zero_p1
<= 40) goto end
;
307 block
[0x1E] = temp_block
[0x1D]; block
[0x13] = temp_block
[0x16];
308 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
309 block
[0x1B] = temp_block
[0x1E]; block
[0x26] = temp_block
[0x25];
310 block
[0x2A] = temp_block
[0x2C]; block
[0x35] = temp_block
[0x33];
311 if(last_non_zero_p1
<= 48) goto end
;
312 block
[0x39] = temp_block
[0x3A]; block
[0x3D] = temp_block
[0x3B];
313 block
[0x32] = temp_block
[0x34]; block
[0x2E] = temp_block
[0x2D];
314 block
[0x23] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
315 block
[0x27] = temp_block
[0x27]; block
[0x2B] = temp_block
[0x2E];
316 if(last_non_zero_p1
<= 56) goto end
;
317 block
[0x36] = temp_block
[0x35]; block
[0x3A] = temp_block
[0x3C];
318 block
[0x3E] = temp_block
[0x3D]; block
[0x33] = temp_block
[0x36];
319 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
320 block
[0x3B] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
/* Same copy sequence with the destination index equal to the source index.
 * NOTE(review): the branch condition introducing this list is not visible here. */
322 if(last_non_zero_p1
<= 1) goto end
;
323 block
[0x01] = temp_block
[0x01];
324 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
325 if(last_non_zero_p1
<= 4) goto end
;
326 block
[0x09] = temp_block
[0x09]; block
[0x02] = temp_block
[0x02];
327 block
[0x03] = temp_block
[0x03];
328 if(last_non_zero_p1
<= 7) goto end
;
329 block
[0x0A] = temp_block
[0x0A]; block
[0x11] = temp_block
[0x11];
330 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
331 if(last_non_zero_p1
<= 11) goto end
;
332 block
[0x19] = temp_block
[0x19];
333 block
[0x12] = temp_block
[0x12]; block
[0x0B] = temp_block
[0x0B];
334 block
[0x04] = temp_block
[0x04]; block
[0x05] = temp_block
[0x05];
335 if(last_non_zero_p1
<= 16) goto end
;
336 block
[0x0C] = temp_block
[0x0C]; block
[0x13] = temp_block
[0x13];
337 block
[0x1A] = temp_block
[0x1A]; block
[0x21] = temp_block
[0x21];
338 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
339 block
[0x29] = temp_block
[0x29]; block
[0x22] = temp_block
[0x22];
340 if(last_non_zero_p1
<= 24) goto end
;
341 block
[0x1B] = temp_block
[0x1B]; block
[0x14] = temp_block
[0x14];
342 block
[0x0D] = temp_block
[0x0D]; block
[0x06] = temp_block
[0x06];
343 block
[0x07] = temp_block
[0x07]; block
[0x0E] = temp_block
[0x0E];
344 block
[0x15] = temp_block
[0x15]; block
[0x1C] = temp_block
[0x1C];
345 if(last_non_zero_p1
<= 32) goto end
;
346 block
[0x23] = temp_block
[0x23]; block
[0x2A] = temp_block
[0x2A];
347 block
[0x31] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
348 block
[0x39] = temp_block
[0x39]; block
[0x32] = temp_block
[0x32];
349 block
[0x2B] = temp_block
[0x2B]; block
[0x24] = temp_block
[0x24];
350 if(last_non_zero_p1
<= 40) goto end
;
351 block
[0x1D] = temp_block
[0x1D]; block
[0x16] = temp_block
[0x16];
352 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
353 block
[0x1E] = temp_block
[0x1E]; block
[0x25] = temp_block
[0x25];
354 block
[0x2C] = temp_block
[0x2C]; block
[0x33] = temp_block
[0x33];
355 if(last_non_zero_p1
<= 48) goto end
;
356 block
[0x3A] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
357 block
[0x34] = temp_block
[0x34]; block
[0x2D] = temp_block
[0x2D];
358 block
[0x26] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
359 block
[0x27] = temp_block
[0x27]; block
[0x2E] = temp_block
[0x2E];
360 if(last_non_zero_p1
<= 56) goto end
;
361 block
[0x35] = temp_block
[0x35]; block
[0x3C] = temp_block
[0x3C];
362 block
[0x3D] = temp_block
[0x3D]; block
[0x36] = temp_block
[0x36];
363 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
364 block
[0x3E] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
/* Generic path: for each of the first last_non_zero_p1 entries of
 * zigzag_direct_noperm, copy that coefficient to its permuted position
 * via block_permute_op(). NOTE(review): the surrounding control flow is
 * not visible here. */
368 for(i=0; i<last_non_zero_p1; i++)
370 int j= zigzag_direct_noperm[i];
371 block[block_permute_op(j)]= temp_block[j];
375 return last_non_zero_p1
- 1;