Merge branch 'ct' of git.pipapo.org:cinelerra-ct into ct
[cinelerra_cv/ct.git] / mpeg2enc / quant_mmx.s
blobc206918e63b063f34a7261ad738b83f5210d5455
;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
;  This program is free software; you can redistribute it and/or
;  modify it under the terms of the GNU General Public License
;  as published by the Free Software Foundation; either version 2
;  of the License, or (at your option) any later version.
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;  You should have received a copy of the GNU General Public License
;  along with this program; if not, write to the Free Software
;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
;
;  quantize_ni_mmx.s:  MMX optimized coefficient quantization sub-routine
;

global quantize_ni_mmx

;-----------------------------------------------------------------------
; int quantize_ni_mmx(short *dst, short *src,
;                     short *quant_mat, short *i_quant_mat,
;                     int imquant, int mquant, int sat_limit)
;
; See quantize.c: quant_non_intra_hv_inv() for reference implementation in C...
;
; Quantises 64 coefficients, processed as 16 quads of four 16-bit words.
;
; Arguments (read from the stack relative to ebp):
;   [ebp+8]   dst          output words
;   [ebp+12]  src          input coefficient words
;   [ebp+16]  quant_mat    quantiser matrix words
;   [ebp+20]  i_quant_mat  matrix of (2^16/quant) words
;   [ebp+24]  imquant      (2^16 / mquant); only the low word is used
;   [ebp+28]  mquant       not currently used
;   [ebp+32]  sat_limit    only the low word is used
;
; Result (eax):
;   0x000000ff  a word of abs(src) exceeded overflim (1023), so the
;               16-bit arithmetic would overflow; processing stops at
;               that quad.
;   0x0000ff00  a quantised word exceeded sat_limit; processing stops at
;               that quad.
;   otherwise   high word = OR of every quantised magnitude (non-zero
;               iff some output coefficient is non-zero), low word = 0.
;   On the two early exits the quads handled before the offending one
;   have already been stored to dst.
;
; Register usage:
;   eax = row (quad) counter
;   ebx = pqm
;   ecx = piqm ; Matrix of quads first (2^16/quant)
;                then (2^16/quant)*(2^16%quant) the second part is for rounding
;   edx = temp
;   edi = pdst
;   esi = psrc
;
;   mm0 = [imquant|0..3]W
;   mm1 = [sat_limit|0..3]W
;   mm2 = *psrc -> working value
;   mm3 = rounding corrections... / temp
;   mm4 = sign mask
;   mm5 = nzflag accumulator
;   mm6 = overflow limit
;   mm7 = temp
;
; ebx, ecx, edx, esi, edi and ebp are saved and restored; mm0-mm7 and
; the flags are clobbered.
;-----------------------------------------------------------------------

;;
;;  private constants needed
;;

SECTION .data
align 16
overflim:
        times 4 dw 1024-1               ; largest abs(src) the 16-bit path accepts

;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
SECTION .bss
align 32
quant_buf:      resw 64

SECTION .text

align 32
quantize_ni_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi

        mov     edi, [ebp+8]            ; edi = pdst
        mov     esi, [ebp+12]           ; esi = psrc
        mov     ebx, [ebp+16]           ; ebx = pqm
        mov     ecx, [ebp+20]           ; ecx = piqm
        movd    mm0, [ebp+24]           ; imquant (2^16 / mquant)
        movq    mm1, mm0
        punpcklwd mm0, mm1
        punpcklwd mm0, mm0              ; mm0 = [imquant|0..3]W

        movq    mm6, [overflim]         ; overflow limit

        movd    mm1, [ebp+32]           ; sat_limit
        movq    mm2, mm1
        punpcklwd mm1, mm2
        punpcklwd mm1, mm1              ; mm1 = [sat_limit|0..3]W

        pxor    mm5, mm5                ; non-zero flag accumulator
        mov     eax, 16                 ; 16 quads to do
        jmp     nextquadniq             ; hop over the alignment padding

align 32
nextquadniq:
        movq    mm2, [esi]              ; mm2 = *psrc

        pxor    mm4, mm4
        pcmpgtw mm4, mm2                ; mm4 = (*psrc < 0) mask
        movq    mm7, mm2                ; mm7 = *psrc
        psllw   mm7, 1                  ; mm7 = 2*(*psrc)
        pand    mm7, mm4                ; mm7 = 2*(*psrc)*(*psrc < 0)
        psubw   mm2, mm7                ; mm2 = abs(*psrc)

        ;;
        ;; Check whether we'll saturate intermediate results.
        ;; Fold the upper two compare words onto the lower two so one
        ;; 32-bit test covers all four words of the quad.
        ;;
        movq    mm7, mm2
        pcmpgtw mm7, mm6                ; too big for 16 bit arithmetic (should be *very* rare)
        movq    mm3, mm7
        psrlq   mm3, 32
        por     mm7, mm3
        movd    edx, mm7
        test    edx, edx
        jnz     near out_of_range

        ;;
        ;; Carry on with the arithmetic...
        ;;
        psllw   mm2, 5                  ; mm2 = 32*abs(*psrc)
        movq    mm7, [ebx]
        psrlw   mm7, 1                  ; mm7 = *pqm >> 1
        paddw   mm2, mm7                ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"

        ;;
        ;; Do the first multiplication.  Cunningly we've set things up so
        ;; it is exactly the top 16 bits we're interested in...
        ;;
        ;; We need the low word results for a rounding correction.
        ;; This is *not* exact (the actual correction is the product
        ;; abs(*psrc)*(*pqm)*(2^16 % *qm) >> 16).
        ;; However we get very very few wrong and none too low (the most
        ;; important) and no errors for small coefficients (also important)
        ;; if we simply add abs(*psrc).
        ;;
        movq    mm3, mm2
        pmullw  mm3, [ecx]              ; low words of p*iqm
        movq    mm7, mm2
        psrlw   mm7, 1                  ; want to see if adding p would carry into upper 16 bits
        psrlw   mm3, 1
        paddw   mm3, mm7
        psrlw   mm3, 15                 ; carry bit in lsb, rest 0's
        pmulhw  mm2, [ecx]              ; mm2 = (p*iqm) >> 16 ~= p / *qm

        ;;
        ;; To hide the latency lets update some pointers...
        ;;
        add     esi, 8                  ; 4 words
        add     ecx, 8                  ; 4 words
        sub     eax, 1

        ;;
        ;; Add rounding correction....
        ;;
        paddw   mm2, mm3

        ;;
        ;; Do the second multiplication, again we need to make a rounding adjustment
        ;; EXPERIMENT: see comments in quantize.c:quant_non_intra_hv don't adjust...
        ;;
;       movq    mm3, mm2
;       pmullw  mm3, mm0
;       movq    mm7, mm2
;       psrlw   mm7, 1                  ; want to see if adding p would carry into upper 16 bits
;       psrlw   mm3, 1
;       paddw   mm3, mm7
;       psrlw   mm3, 15                 ; high bit in lsb, rest 0's

        pmulhw  mm2, mm0                ; mm2 ~= (p/(qm*mquant))

        ;;
        ;; To hide the latency lets update some more pointers...
        ;;
        add     edi, 8
        add     ebx, 8

        ;;
        ;; Correct rounding and the factor of two (we want p/(qm*2*mquant))
        ;;
;       paddw   mm2, mm3
        psrlw   mm2, 1

        ;;
        ;; Check for saturation.  Same fold as the overflow check above,
        ;; so all four words of the quad are tested against sat_limit.
        ;;
        movq    mm7, mm2
        pcmpgtw mm7, mm1
        movq    mm3, mm7
        psrlq   mm3, 32
        por     mm7, mm3
        movd    edx, mm7
        test    edx, edx
        jnz     saturated

        ;;
        ;; Accumulate non-zero flags
        ;;
        por     mm5, mm2

        ;;
        ;; Now correct the sign, mm4 = (*psrc < 0) mask
        ;;
        pxor    mm7, mm7
        psubw   mm7, mm2
        psllw   mm7, 1                  ; mm7 = -2*mm2
        pand    mm7, mm4                ; mm7 = -2*mm2 * (*psrc < 0)
        paddw   mm2, mm7                ; mm2 = samesign(*psrc, mm2)

        ;;
        ;; Store the quantised words....
        ;;
        movq    [edi-8], mm2            ; edi was already advanced past this quad
        test    eax, eax
        jnz     near nextquadniq

        ;;
        ;; Return saturation in low word and nzflag in high word of result dword
        ;;
        movq    mm0, mm5
        psrlq   mm0, 32
        por     mm5, mm0                ; fold upper two words onto lower two
        movd    edx, mm5
        mov     ebx, edx
        shl     ebx, 16
        or      edx, ebx                ; high word of edx = OR of all four words
        and     edx, 0xffff0000         ; high word is nzflag, low word cleared
        mov     eax, edx

return:
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp                     ; restore frame pointer

        emms                            ; clear mmx registers
        ret

out_of_range:
        mov     eax, 0x00ff
        jmp     return                  ; unconditional: must not fall into saturated
saturated:
        mov     eax, 0xff00
        jmp     return
;;;
;;; void iquant_non_intra_m1_sse(int16_t *src, int16_t *dst,
;;;                              uint16_t *quant_mat)
;;;
;;; mmx/sse Inverse mpeg-1 quantisation routine.  Handles the 64 words
;;; of one DCT block, four words per iteration.  Uses pmaxsw/pminsw.
;;;
;;; Arguments are read from the stack: src at [ebp+8], dst at [ebp+12],
;;; quant_mat at [ebp+16].
;;;
;;; Register usage:
;;;   eax - byte offset of the current quad (shared by all three arrays)
;;;   edi - src
;;;   esi - dst
;;;   edx - quant_mat
;;;
;;;   mm7 = [1|0..3]W
;;;   mm6 = [2047|0..3]W
;;;   mm5 = 0
;;;
;;; eax, esi, edi, edx and ebp are saved and restored; mm0-mm7 are used.
;;;
;;; NOTE(review): the product is pmullw (low 16 bits only) followed by an
;;; arithmetic shift, so a product with bit 15 set comes out negative --
;;; confirm callers keep (2*val+1)*quant_mat below 0x8000.
;;;

global iquant_non_intra_m1_sse
align 32
iquant_non_intra_m1_sse:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    esi
        push    edi
        push    edx

        mov     edi, [ebp+8]            ; edi = src
        mov     esi, [ebp+12]           ; esi = dst
        mov     edx, [ebp+16]           ; edx = quant table

        mov     eax, 1
        movd    mm7, eax
        punpcklwd mm7, mm7
        punpckldq mm7, mm7              ; mm7 = [1|0..3]W

        mov     eax, 2047
        movd    mm6, eax
        punpcklwd mm6, mm6
        punpckldq mm6, mm6              ; mm6 = [2047|0..3]W

        pxor    mm5, mm5                ; mm5 = 0
        xor     eax, eax                ; start at byte offset 0

.quad:
        movq    mm0, [edi+eax]          ; mm0 = four source words
        pxor    mm1, mm1
        movq    mm2, mm0
        pcmpeqw mm2, mm1                ; mm2 = (src == 0) mask
        pcmpeqw mm2, mm1                ; mm2 = (src != 0) mask

        ;; Work with the absolute value for convenience...
        psubw   mm1, mm0                ; mm1 = -src
        pmaxsw  mm1, mm0                ; mm1 = val = max(src, -src) = abs(src)
        paddw   mm1, mm1                ; val *= 2
        paddw   mm1, mm7                ; val += 1
        pmullw  mm1, [edx+eax]          ; val = (2*abs(src)+1) * quant_mat
        psraw   mm1, 5                  ; val /= 32

        ;; Mismatch control: subtract 1 from even, non-zero values
        movq    mm4, mm1
        pcmpeqw mm4, mm5                ; mm4 = (val == 0) mask
        pxor    mm4, mm7                ; low bit now (val != 0)
        movq    mm3, mm1
        pand    mm3, mm7
        pxor    mm3, mm7                ; mm3 = ~(val&1) in the low bit, others 0
        pand    mm3, mm4                ; mm3 = (~(val&1)) & (val != 0)

        psubw   mm1, mm3                ; val -= (~(val&1)) & (val != 0)
        pminsw  mm1, mm6                ; val = min(val, 2047)

        ;; Handle the zero case and restore the sign
        pand    mm1, mm2                ; force 0 where src was 0
        pxor    mm3, mm3
        psubw   mm3, mm1                ; mm3 = -res
        paddw   mm3, mm3                ; mm3 = -2*res
        pcmpgtw mm0, mm5                ; mm0 = (src > 0) mask
        pcmpeqw mm0, mm5                ; mm0 = (src <= 0) mask
        pand    mm3, mm0                ; mm3 = src <= 0 ? -2*res : 0
        paddw   mm1, mm3                ; mm1 = samesign(src, res)
        movq    [esi+eax], mm1

        add     eax, 8                  ; next quad (4 words)
        cmp     eax, 64*2               ; 64 coeffs in a DCT block, 2 bytes each
        jne     .quad

        pop     edx
        pop     edi
        pop     esi
        pop     eax

        pop     ebp                     ; restore frame pointer

        emms                            ; clear mmx registers
        ret
;;;
;;; void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst,
;;;                              uint16_t *quant_mat)
;;;
;;; Plain-MMX inverse mpeg-1 quantisation routine.  Handles the 64 words
;;; of one DCT block, four words per iteration.
;;;
;;; Arguments are read from the stack: src at [ebp+8], dst at [ebp+12],
;;; quant_mat at [ebp+16].
;;;
;;; Register usage:
;;;   eax - remaining coefficient count
;;;   edi - src
;;;   esi - dst
;;;   edx - quant_mat
;;;
;;;   mm7 = [1|0..3]W
;;;   mm6 = [MAX_INT16-2047|0..3]W
;;;   mm5 = 0
;;;
;;; eax, esi, edi, edx and ebp are saved and restored; mm0-mm7 are used.
;;;
;;; NOTE(review): the product is pmullw (low 16 bits only) followed by an
;;; arithmetic shift, so a product with bit 15 set comes out negative --
;;; confirm callers keep (2*val+1)*quant_mat below 0x8000.
;;;

global iquant_non_intra_m1_mmx
align 32
iquant_non_intra_m1_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    esi
        push    edi
        push    edx

        mov     edi, [ebp+8]            ; edi = src
        mov     esi, [ebp+12]           ; esi = dst
        mov     edx, [ebp+16]           ; edx = quant table

        mov     eax, 1
        movd    mm7, eax
        punpcklwd mm7, mm7
        punpckldq mm7, mm7              ; mm7 = [1|0..3]W

        ;; The clamp below uses a *signed* saturating add, so the bias has
        ;; to be MAX_INT16-2047: anything above 2047 then pins at 0x7fff
        ;; and comes back as exactly 2047.  (0xffff-2047 is -2048 as a
        ;; signed word and would never saturate at the top.)
        mov     eax, (0x7fff-2047)
        movd    mm6, eax
        punpcklwd mm6, mm6
        punpckldq mm6, mm6              ; mm6 = [MAX_INT16-2047|0..3]W

        mov     eax, 64                 ; 64 coeffs in a DCT block
        pxor    mm5, mm5                ; mm5 = 0

iquant_loop:
        movq    mm0, [edi]              ; mm0 = *psrc
        add     edi, 8
        pxor    mm1, mm1
        movq    mm2, mm0
        pcmpeqw mm2, mm5                ; mm2 = (*psrc == 0) mask
        pcmpeqw mm2, mm5                ; mm2 = (*psrc != 0) mask

        ;; Work with the absolute value for convenience...
        psubw   mm1, mm0                ; mm1 = -*psrc
        psllw   mm1, 1                  ; mm1 = -2* *psrc
        movq    mm3, mm0
        pcmpgtw mm3, mm5                ; mm3 = (*psrc > 0) mask
        pcmpeqw mm3, mm5                ; mm3 = (*psrc <= 0) mask
        pand    mm3, mm1                ; mm3 = (*psrc <= 0) * -2 * *psrc
        movq    mm1, mm0
        paddw   mm1, mm3                ; mm1 = *psrc + mm3 = abs(*psrc)

        paddw   mm1, mm1                ; mm1 *= 2
        paddw   mm1, mm7                ; mm1 += 1
        pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat
        add     edx, 8
        psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32

        ;; Mismatch control: subtract 1 from even, non-zero values
        movq    mm3, mm1
        pand    mm3, mm7
        pxor    mm3, mm7                ; mm3 = ~(val&1) in the low bit, others 0
        movq    mm4, mm1
        pcmpeqw mm4, mm5                ; mm4 = (val == 0) mask
        pxor    mm4, mm7                ; low bit now (val != 0)
        pand    mm3, mm4                ; mm3 = (~(val&1)) & (val != 0)

        psubw   mm1, mm3                ; mm1 -= (~(val&1)) & (val != 0)

        ;; Signed clamp to 2047 without pminsw
        paddsw  mm1, mm6                ; pins at 0x7fff if val > 2047
        psubw   mm1, mm6                ; 2047 if saturated... unchanged otherwise

        ;; Handle the zero case and restore the sign
        pand    mm1, mm2                ; force 0 where *psrc was 0
        pxor    mm3, mm3
        psubw   mm3, mm1                ; mm3 = -res
        paddw   mm3, mm3                ; mm3 = -2*res
        pcmpgtw mm0, mm5                ; mm0 = (*psrc > 0) mask
        pcmpeqw mm0, mm5                ; mm0 = (*psrc <= 0) mask
        pand    mm3, mm0                ; mm3 = *psrc <= 0 ? -2*res : 0
        paddw   mm1, mm3                ; mm1 = samesign(*psrc, res)
        movq    [esi], mm1
        add     esi, 8

        sub     eax, 4                  ; four coefficients done
        jnz     near iquant_loop

        pop     edx
        pop     edi
        pop     esi
        pop     eax

        pop     ebp                     ; restore frame pointer

        emms                            ; clear mmx registers
        ret
;;;
;;; int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
;;;
;;; Simply add up the sum of coefficients weighted by their
;;; quantisation coefficients: the sum over 64 words of
;;; abs(src[i]) * i_quant_mat[i], using a signed 16x16->32 multiply
;;; and 32-bit wrap-around accumulation.
;;;
;;; Arguments are read from the stack: src at [ebp+8], i_quant_mat at
;;; [ebp+12].  The sum is returned in eax.
;;;
;;; Register usage:
;;;   ecx - byte offset of the current quad (shared by both arrays)
;;;   edi - src
;;;   esi - i_quant_mat
;;;
;;;   mm6 = two dword partial sums
;;;   mm7 = 0
;;;
;;; ecx, esi, edi and ebp are saved and restored; mm0-mm3, mm6 and mm7
;;; are used.
;;;

global quant_weight_coeff_sum_mmx
align 32
quant_weight_coeff_sum_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    ecx
        push    esi
        push    edi

        mov     edi, [ebp+8]            ; edi = src
        mov     esi, [ebp+12]           ; esi = i_quant_mat

        pxor    mm6, mm6                ; accumulator
        pxor    mm7, mm7                ; zero
        xor     ecx, ecx                ; start at byte offset 0

.quad:
        movq    mm0, [edi+ecx]          ; four coefficients
        movq    mm2, [esi+ecx]          ; four weights

        ;; abs(): flip the bits of negative words, then add 1 to them
        movq    mm1, mm7
        pcmpgtw mm1, mm0                ; mm1 = (coeff < 0) mask (0xffff / 0)
        pxor    mm0, mm1
        psubw   mm0, mm1                ; mm0 = abs(coeff)

        ;; Low and high words of the four products
        movq    mm1, mm0
        pmullw  mm0, mm2                ; low words
        pmulhw  mm1, mm2                ; high words (signed multiply)

        ;; Interleave into four dword products and accumulate
        movq    mm3, mm0
        punpcklwd mm3, mm1              ; products of words 0 and 1
        punpckhwd mm0, mm1              ; products of words 2 and 3
        paddd   mm6, mm3
        paddd   mm6, mm0

        add     ecx, 8                  ; next quad (4 words)
        cmp     ecx, 16*8               ; 16 coefficient / quantiser quads to process
        jne     .quad

        ;; Add the two dword lanes of the accumulator
        movd    eax, mm6
        psrlq   mm6, 32
        movd    ecx, mm6
        add     eax, ecx

        pop     edi
        pop     esi
        pop     ecx

        pop     ebp                     ; restore frame pointer

        emms                            ; clear mmx registers
        ret