2 * MPEG video MMX templates
4 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
24 #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t"
28 "punpcklwd " #a ", " #a " \n\t"\
29 "punpcklwd " #a ", " #a " \n\t"
31 "psubusw " #a ", " #b " \n\t"\
32 "paddw " #a ", " #b " \n\t"
35 static int RENAME(dct_quantize
)(MpegEncContext
*s
,
36 DCTELEM
*block
, int n
,
37 int qscale
, int *overflow
)
39 long last_non_zero_p1
;
40 int level
=0, q
; // =0 only because gcc warns that it may be used uninitialized ...
41 const uint16_t *qmat
, *bias
;
42 __align8
int16_t temp_block
[64];
44 assert((7&(int)(&temp_block
[0])) == 0); //did gcc align it correctly?
47 RENAMEl(ff_fdct
) (block
); // can't be anything else ...
50 s
->denoise_dct(s
, block
);
58 /* note: block[0] is assumed to be positive */
63 : "=d" (level
), "=a"(dummy
)
64 : "a" ((block
[0]>>2) + q
), "c" (inverse
[q
<<1])
68 "xorl %%edx, %%edx \n\t"
70 "movzwl %%ax, %%eax \n\t"
72 : "a" ((block
[0]>>2) + q
), "c" (q
<<1)
77 /* For AIC we skip quant/dequant of INTRADC */
78 level
= (block
[0] + 4)>>3;
80 block
[0]=0; //avoid fake overflow
81 // temp_block[0] = (block[0] + (q >> 1)) / q;
83 bias
= s
->q_intra_matrix16
[qscale
][1];
84 qmat
= s
->q_intra_matrix16
[qscale
][0];
87 bias
= s
->q_inter_matrix16
[qscale
][1];
88 qmat
= s
->q_inter_matrix16
[qscale
][0];
91 if((s
->out_format
== FMT_H263
|| s
->out_format
== FMT_H261
) && s
->mpeg_quant
==0){
94 "movd %%"REG_a
", %%mm3 \n\t" // last_non_zero_p1
96 "pxor %%mm7, %%mm7 \n\t" // 0
97 "pxor %%mm4, %%mm4 \n\t" // 0
98 "movq (%2), %%mm5 \n\t" // qmat[0]
99 "pxor %%mm6, %%mm6 \n\t"
100 "psubw (%3), %%mm6 \n\t" // -bias[0]
101 "mov $-128, %%"REG_a
" \n\t"
104 "pxor %%mm1, %%mm1 \n\t" // 0
105 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // block[i]
106 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] < 0 ? 0xFFFF : 0x0000
107 "pxor %%mm1, %%mm0 \n\t"
108 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
109 "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
110 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
111 "por %%mm0, %%mm4 \n\t"
112 "pxor %%mm1, %%mm0 \n\t"
113 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
114 "movq %%mm0, (%5, %%"REG_a
") \n\t"
115 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
116 "movq (%4, %%"REG_a
"), %%mm1 \n\t"
117 "movq %%mm7, (%1, %%"REG_a
") \n\t" // 0
118 "pandn %%mm1, %%mm0 \n\t"
120 "add $8, %%"REG_a
" \n\t"
122 "movq %%mm3, %%mm0 \n\t"
123 "psrlq $32, %%mm3 \n\t"
125 "movq %%mm3, %%mm0 \n\t"
126 "psrlq $16, %%mm3 \n\t"
128 "movd %%mm3, %%"REG_a
" \n\t"
129 "movzb %%al, %%"REG_a
" \n\t" // last_non_zero_p1
130 : "+a" (last_non_zero_p1
)
131 : "r" (block
+64), "r" (qmat
), "r" (bias
),
132 "r" (inv_zigzag_direct16
+64), "r" (temp_block
+64)
134 // note: the asm is split because gcc doesn't like that many operands ...
136 "movd %1, %%mm1 \n\t" // max_qcoeff
138 "psubusw %%mm1, %%mm4 \n\t"
139 "packuswb %%mm4, %%mm4 \n\t"
140 "movd %%mm4, %0 \n\t" // *overflow
142 : "g" (s
->max_qcoeff
)
146 "movd %%"REG_a
", %%mm3 \n\t" // last_non_zero_p1
148 "pxor %%mm7, %%mm7 \n\t" // 0
149 "pxor %%mm4, %%mm4 \n\t" // 0
150 "mov $-128, %%"REG_a
" \n\t"
153 "pxor %%mm1, %%mm1 \n\t" // 0
154 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // block[i]
155 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] < 0 ? 0xFFFF : 0x0000
156 "pxor %%mm1, %%mm0 \n\t"
157 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
158 "movq (%3, %%"REG_a
"), %%mm6 \n\t" // bias[0]
159 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
160 "movq (%2, %%"REG_a
"), %%mm5 \n\t" // qmat[i]
161 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
162 "por %%mm0, %%mm4 \n\t"
163 "pxor %%mm1, %%mm0 \n\t"
164 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[i] + bias[i]*qmat[i])>>16)*sign(block[i])
165 "movq %%mm0, (%5, %%"REG_a
") \n\t"
166 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
167 "movq (%4, %%"REG_a
"), %%mm1 \n\t"
168 "movq %%mm7, (%1, %%"REG_a
") \n\t" // 0
169 "pandn %%mm1, %%mm0 \n\t"
171 "add $8, %%"REG_a
" \n\t"
173 "movq %%mm3, %%mm0 \n\t"
174 "psrlq $32, %%mm3 \n\t"
176 "movq %%mm3, %%mm0 \n\t"
177 "psrlq $16, %%mm3 \n\t"
179 "movd %%mm3, %%"REG_a
" \n\t"
180 "movzb %%al, %%"REG_a
" \n\t" // last_non_zero_p1
181 : "+a" (last_non_zero_p1
)
182 : "r" (block
+64), "r" (qmat
+64), "r" (bias
+64),
183 "r" (inv_zigzag_direct16
+64), "r" (temp_block
+64)
185 // note: the asm is split because gcc doesn't like that many operands ...
187 "movd %1, %%mm1 \n\t" // max_qcoeff
189 "psubusw %%mm1, %%mm4 \n\t"
190 "packuswb %%mm4, %%mm4 \n\t"
191 "movd %%mm4, %0 \n\t" // *overflow
193 : "g" (s
->max_qcoeff
)
197 if(s
->mb_intra
) block
[0]= level
;
198 else block
[0]= temp_block
[0];
200 if(s
->dsp
.idct_permutation_type
== FF_SIMPLE_IDCT_PERM
){
201 if(last_non_zero_p1
<= 1) goto end
;
202 block
[0x08] = temp_block
[0x01]; block
[0x10] = temp_block
[0x08];
203 block
[0x20] = temp_block
[0x10];
204 if(last_non_zero_p1
<= 4) goto end
;
205 block
[0x18] = temp_block
[0x09]; block
[0x04] = temp_block
[0x02];
206 block
[0x09] = temp_block
[0x03];
207 if(last_non_zero_p1
<= 7) goto end
;
208 block
[0x14] = temp_block
[0x0A]; block
[0x28] = temp_block
[0x11];
209 block
[0x12] = temp_block
[0x18]; block
[0x02] = temp_block
[0x20];
210 if(last_non_zero_p1
<= 11) goto end
;
211 block
[0x1A] = temp_block
[0x19]; block
[0x24] = temp_block
[0x12];
212 block
[0x19] = temp_block
[0x0B]; block
[0x01] = temp_block
[0x04];
213 block
[0x0C] = temp_block
[0x05];
214 if(last_non_zero_p1
<= 16) goto end
;
215 block
[0x11] = temp_block
[0x0C]; block
[0x29] = temp_block
[0x13];
216 block
[0x16] = temp_block
[0x1A]; block
[0x0A] = temp_block
[0x21];
217 block
[0x30] = temp_block
[0x28]; block
[0x22] = temp_block
[0x30];
218 block
[0x38] = temp_block
[0x29]; block
[0x06] = temp_block
[0x22];
219 if(last_non_zero_p1
<= 24) goto end
;
220 block
[0x1B] = temp_block
[0x1B]; block
[0x21] = temp_block
[0x14];
221 block
[0x1C] = temp_block
[0x0D]; block
[0x05] = temp_block
[0x06];
222 block
[0x0D] = temp_block
[0x07]; block
[0x15] = temp_block
[0x0E];
223 block
[0x2C] = temp_block
[0x15]; block
[0x13] = temp_block
[0x1C];
224 if(last_non_zero_p1
<= 32) goto end
;
225 block
[0x0B] = temp_block
[0x23]; block
[0x34] = temp_block
[0x2A];
226 block
[0x2A] = temp_block
[0x31]; block
[0x32] = temp_block
[0x38];
227 block
[0x3A] = temp_block
[0x39]; block
[0x26] = temp_block
[0x32];
228 block
[0x39] = temp_block
[0x2B]; block
[0x03] = temp_block
[0x24];
229 if(last_non_zero_p1
<= 40) goto end
;
230 block
[0x1E] = temp_block
[0x1D]; block
[0x25] = temp_block
[0x16];
231 block
[0x1D] = temp_block
[0x0F]; block
[0x2D] = temp_block
[0x17];
232 block
[0x17] = temp_block
[0x1E]; block
[0x0E] = temp_block
[0x25];
233 block
[0x31] = temp_block
[0x2C]; block
[0x2B] = temp_block
[0x33];
234 if(last_non_zero_p1
<= 48) goto end
;
235 block
[0x36] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
236 block
[0x23] = temp_block
[0x34]; block
[0x3C] = temp_block
[0x2D];
237 block
[0x07] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
238 block
[0x0F] = temp_block
[0x27]; block
[0x35] = temp_block
[0x2E];
239 if(last_non_zero_p1
<= 56) goto end
;
240 block
[0x2E] = temp_block
[0x35]; block
[0x33] = temp_block
[0x3C];
241 block
[0x3E] = temp_block
[0x3D]; block
[0x27] = temp_block
[0x36];
242 block
[0x3D] = temp_block
[0x2F]; block
[0x2F] = temp_block
[0x37];
243 block
[0x37] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
244 }else if(s
->dsp
.idct_permutation_type
== FF_LIBMPEG2_IDCT_PERM
){
245 if(last_non_zero_p1
<= 1) goto end
;
246 block
[0x04] = temp_block
[0x01];
247 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
248 if(last_non_zero_p1
<= 4) goto end
;
249 block
[0x0C] = temp_block
[0x09]; block
[0x01] = temp_block
[0x02];
250 block
[0x05] = temp_block
[0x03];
251 if(last_non_zero_p1
<= 7) goto end
;
252 block
[0x09] = temp_block
[0x0A]; block
[0x14] = temp_block
[0x11];
253 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
254 if(last_non_zero_p1
<= 11) goto end
;
255 block
[0x1C] = temp_block
[0x19];
256 block
[0x11] = temp_block
[0x12]; block
[0x0D] = temp_block
[0x0B];
257 block
[0x02] = temp_block
[0x04]; block
[0x06] = temp_block
[0x05];
258 if(last_non_zero_p1
<= 16) goto end
;
259 block
[0x0A] = temp_block
[0x0C]; block
[0x15] = temp_block
[0x13];
260 block
[0x19] = temp_block
[0x1A]; block
[0x24] = temp_block
[0x21];
261 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
262 block
[0x2C] = temp_block
[0x29]; block
[0x21] = temp_block
[0x22];
263 if(last_non_zero_p1
<= 24) goto end
;
264 block
[0x1D] = temp_block
[0x1B]; block
[0x12] = temp_block
[0x14];
265 block
[0x0E] = temp_block
[0x0D]; block
[0x03] = temp_block
[0x06];
266 block
[0x07] = temp_block
[0x07]; block
[0x0B] = temp_block
[0x0E];
267 block
[0x16] = temp_block
[0x15]; block
[0x1A] = temp_block
[0x1C];
268 if(last_non_zero_p1
<= 32) goto end
;
269 block
[0x25] = temp_block
[0x23]; block
[0x29] = temp_block
[0x2A];
270 block
[0x34] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
271 block
[0x3C] = temp_block
[0x39]; block
[0x31] = temp_block
[0x32];
272 block
[0x2D] = temp_block
[0x2B]; block
[0x22] = temp_block
[0x24];
273 if(last_non_zero_p1
<= 40) goto end
;
274 block
[0x1E] = temp_block
[0x1D]; block
[0x13] = temp_block
[0x16];
275 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
276 block
[0x1B] = temp_block
[0x1E]; block
[0x26] = temp_block
[0x25];
277 block
[0x2A] = temp_block
[0x2C]; block
[0x35] = temp_block
[0x33];
278 if(last_non_zero_p1
<= 48) goto end
;
279 block
[0x39] = temp_block
[0x3A]; block
[0x3D] = temp_block
[0x3B];
280 block
[0x32] = temp_block
[0x34]; block
[0x2E] = temp_block
[0x2D];
281 block
[0x23] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
282 block
[0x27] = temp_block
[0x27]; block
[0x2B] = temp_block
[0x2E];
283 if(last_non_zero_p1
<= 56) goto end
;
284 block
[0x36] = temp_block
[0x35]; block
[0x3A] = temp_block
[0x3C];
285 block
[0x3E] = temp_block
[0x3D]; block
[0x33] = temp_block
[0x36];
286 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
287 block
[0x3B] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
289 if(last_non_zero_p1
<= 1) goto end
;
290 block
[0x01] = temp_block
[0x01];
291 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
292 if(last_non_zero_p1
<= 4) goto end
;
293 block
[0x09] = temp_block
[0x09]; block
[0x02] = temp_block
[0x02];
294 block
[0x03] = temp_block
[0x03];
295 if(last_non_zero_p1
<= 7) goto end
;
296 block
[0x0A] = temp_block
[0x0A]; block
[0x11] = temp_block
[0x11];
297 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
298 if(last_non_zero_p1
<= 11) goto end
;
299 block
[0x19] = temp_block
[0x19];
300 block
[0x12] = temp_block
[0x12]; block
[0x0B] = temp_block
[0x0B];
301 block
[0x04] = temp_block
[0x04]; block
[0x05] = temp_block
[0x05];
302 if(last_non_zero_p1
<= 16) goto end
;
303 block
[0x0C] = temp_block
[0x0C]; block
[0x13] = temp_block
[0x13];
304 block
[0x1A] = temp_block
[0x1A]; block
[0x21] = temp_block
[0x21];
305 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
306 block
[0x29] = temp_block
[0x29]; block
[0x22] = temp_block
[0x22];
307 if(last_non_zero_p1
<= 24) goto end
;
308 block
[0x1B] = temp_block
[0x1B]; block
[0x14] = temp_block
[0x14];
309 block
[0x0D] = temp_block
[0x0D]; block
[0x06] = temp_block
[0x06];
310 block
[0x07] = temp_block
[0x07]; block
[0x0E] = temp_block
[0x0E];
311 block
[0x15] = temp_block
[0x15]; block
[0x1C] = temp_block
[0x1C];
312 if(last_non_zero_p1
<= 32) goto end
;
313 block
[0x23] = temp_block
[0x23]; block
[0x2A] = temp_block
[0x2A];
314 block
[0x31] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
315 block
[0x39] = temp_block
[0x39]; block
[0x32] = temp_block
[0x32];
316 block
[0x2B] = temp_block
[0x2B]; block
[0x24] = temp_block
[0x24];
317 if(last_non_zero_p1
<= 40) goto end
;
318 block
[0x1D] = temp_block
[0x1D]; block
[0x16] = temp_block
[0x16];
319 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
320 block
[0x1E] = temp_block
[0x1E]; block
[0x25] = temp_block
[0x25];
321 block
[0x2C] = temp_block
[0x2C]; block
[0x33] = temp_block
[0x33];
322 if(last_non_zero_p1
<= 48) goto end
;
323 block
[0x3A] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
324 block
[0x34] = temp_block
[0x34]; block
[0x2D] = temp_block
[0x2D];
325 block
[0x26] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
326 block
[0x27] = temp_block
[0x27]; block
[0x2E] = temp_block
[0x2E];
327 if(last_non_zero_p1
<= 56) goto end
;
328 block
[0x35] = temp_block
[0x35]; block
[0x3C] = temp_block
[0x3C];
329 block
[0x3D] = temp_block
[0x3D]; block
[0x36] = temp_block
[0x36];
330 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
331 block
[0x3E] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
335 for(i=0; i<last_non_zero_p1; i++)
337 int j= zigzag_direct_noperm[i];
338 block[block_permute_op(j)]= temp_block[j];
342 return last_non_zero_p1
- 1;