my first commit, i only added the file TEST to see how it works
[cinelerra_cv/mob.git] / mpeg2enc / mblockq_sad_mmxe.s
blob0e57ea5ef45ef3b86f638adebe8b3989af0409cf
1 ;;;
2 ;;; mblockq_sad_mmxe.s:
3 ;;;
4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock
5 ;;; quads (2 by 2 squares of adjacent macroblocks)
7 ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
8 ;;; evaluates macroblock quads. A lot of memory accesses can be saved
9 ;;; if each quad is done together rather than each macroblock in the
10 ;;; quad handled individually.
12 ;;; TODO: Really there ought to be MMX versions and the function's
13 ;;; specification should be documented...
15 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
19 ; This program is free software; you can redistribute it and/or
20 ; modify it under the terms of the GNU General Public License
21 ; as published by the Free Software Foundation; either version 2
22 ; of the License, or (at your option) any later version.
24 ; This program is distributed in the hope that it will be useful,
25 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
26 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 ; GNU General Public License for more details.
29 ; You should have received a copy of the GNU General Public License
30 ; along with this program; if not, write to the Free Software
31 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
35 ;;; CURRENTLY not used but used in testing as reference for tweaks...
global mblockq_dist1_REF

;-----------------------------------------------------------------------
; void mblockq_dist1_REF(char *blk1, char *blk2, int lx, int h,
;                        int *weightvec);
;
; Reference version of the macroblock-quad SAD: four sums of absolute
; differences between 16-byte-wide rows of blk1 and blk2, each kept in
; its own MMX accumulator.
;
; ABI:   32-bit, all arguments on the stack (read from [ebp+8]..[ebp+24]).
; In:    blk1      = first row of the block being matched
;        blk2      = first row of the block it is compared against
;        lx        = distance in bytes between consecutive rows
;        h         = number of rows; must be >= 1, because the row
;                    counter is decremented before it is tested
;        weightvec = receives four 32-bit results
; Out:   weightvec[0] = sum r=0..h-1 of SAD16(blk1 row r,     blk2 row r)
;        weightvec[1] = sum r=0..h-1 of SAD16(blk1 row r + 1, blk2 row r)
;        weightvec[2] = sum r=1..h   of SAD16(blk1 row r,     blk2 row r-1)
;        weightvec[3] = sum r=1..h   of SAD16(blk1 row r + 1, blk2 row r-1)
;        ("+ 1" means the blk1 row is read starting one byte later.)
; Reads: 17 bytes from each of blk1 rows 0..h, 16 bytes from each of
;        blk2 rows 0..h-1.
; Regs:  eax = blk1 row pointer     ebx = blk2 row pointer
;        edx = lx                   edi = rows left        esi = h
;        mm0..mm3 = accumulators for weightvec[0..3]
;        mm4..mm7 = temporaries
;        Every general register that is touched is saved and restored;
;        the MMX state is released with emms before returning.
;-----------------------------------------------------------------------
        align   32
mblockq_dist1_REF:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        pxor    mm0, mm0                ; zero the four accumulators
        pxor    mm1, mm1
        pxor    mm2, mm2
        pxor    mm3, mm3
        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = rows left
        mov     esi, edi                ; esi = h, used to spot the first row

        jmp     .next_row               ; hop over the alignment padding
        align   32
.next_row:
        ;; weightvec[0]: blk1 row against blk2 row
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm6, mm4                ; keep a copy for weightvec[2]
        movq    mm5, [ebx]              ; blk2 bytes 0..7, reused below
        psadbw  mm4, mm5
        paddd   mm0, mm4
        movq    mm4, [eax+8]            ; blk1 bytes 8..15
        movq    mm7, mm4                ; keep a copy for weightvec[2]
        psadbw  mm4, [ebx+8]
        paddd   mm0, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above0

        ;; weightvec[2]: blk1 row against the previous blk2 row
        sub     ebx, edx                ; step back to the previous blk2 row
        psadbw  mm6, [ebx]
        paddd   mm2, mm6
        psadbw  mm7, [ebx+8]
        add     ebx, edx                ; back to the current blk2 row
        paddd   mm2, mm7

.skip_above0:
        ;; weightvec[1]: blk1 row shifted by one byte against blk2 row
        movq    mm4, [eax+1]
        movq    mm6, mm4                ; keep a copy for weightvec[3]
        psadbw  mm4, mm5                ; mm5 still holds blk2 bytes 0..7
        paddd   mm1, mm4
        movq    mm4, [eax+9]
        movq    mm7, mm4                ; keep a copy for weightvec[3]
        psadbw  mm4, [ebx+8]
        paddd   mm1, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above1

        ;; weightvec[3]: shifted blk1 row against the previous blk2 row
        sub     ebx, edx
        psadbw  mm6, [ebx]
        psadbw  mm7, [ebx+8]
        add     ebx, edx
        paddd   mm3, mm6
        paddd   mm3, mm7

.skip_above1:
        add     eax, edx                ; advance both pointers one row
        add     ebx, edx

        sub     edi, 1
        jnz     near .next_row

        ;; Row h of blk1 (one past the loop) against the last blk2 row
        ;; completes weightvec[2] and weightvec[3].
        movq    mm4, [eax]
        movq    mm5, [eax+8]
        sub     ebx, edx                ; ebx = blk2 row h-1
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm2, mm4
        paddd   mm2, mm5

        movq    mm4, [eax+1]
        movq    mm5, [eax+9]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm3, mm4
        paddd   mm3, mm5

        mov     eax, [ebp+24]           ; eax = weightvec
        movd    [eax+0], mm0
        movd    [eax+4], mm1
        movd    [eax+8], mm2
        movd    [eax+12], mm3

        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax

        pop     ebp
        emms
        ret
global mblockq_dist1_mmxe

; pshufw selector that exchanges the low and high dwords of an MMX
; register (words 2,3,0,1).
%ifndef SWAP_DWORDS
%define SWAP_DWORDS (2*1 + 3*4 + 0*16 + 1*64)
%endif

;-----------------------------------------------------------------------
; void mblockq_dist1_mmxe(char *blk1, char *blk2, int lx, int h,
;                         int *weightvec);
;
; Macroblock-quad SAD: four sums of absolute differences between
; 16-byte-wide rows of blk1 and blk2.  Two results are packed into each
; accumulator register (one per dword); pshufw swaps the dwords so that
; the half being updated sits in the low dword, where psadbw leaves its
; result.
;
; ABI:   32-bit, all arguments on the stack (read from [ebp+8]..[ebp+24]).
; In:    blk1      = first row of the block being matched
;        blk2      = first row of the block it is compared against
;        lx        = distance in bytes between consecutive rows
;        h         = number of rows; must be >= 1, because the row
;                    counter is decremented before it is tested
;        weightvec = receives four 32-bit results
; Out:   weightvec[0] = sum r=0..h-1 of SAD16(blk1 row r,     blk2 row r)
;        weightvec[1] = sum r=0..h-1 of SAD16(blk1 row r + 1, blk2 row r)
;        weightvec[2] = sum r=1..h   of SAD16(blk1 row r,     blk2 row r-1)
;        weightvec[3] = sum r=1..h   of SAD16(blk1 row r + 1, blk2 row r-1)
;        ("+ 1" means the blk1 row is read starting one byte later.)
; Reads: 17 bytes from each of blk1 rows 0..h, 16 bytes from each of
;        blk2 rows 0..h-1.
; Regs:  eax = blk1 row pointer     ebx = blk2 row pointer
;        edx = lx                   edi = rows left        esi = h
;        mm0 = low dword weightvec[0], high dword weightvec[2]
;        mm1 = low dword weightvec[1], high dword weightvec[3]
;        mm2/mm3 = the two halves of the previous blk2 row
;        mm4..mm7 = temporaries
;        Every general register that is touched is saved and restored;
;        the MMX state is released with emms before returning.
;-----------------------------------------------------------------------
        align   32
mblockq_dist1_mmxe:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, [ebp+8]            ; eax = blk1
        prefetcht0 [eax]
        pxor    mm0, mm0                ; zero both packed accumulators
        pxor    mm1, mm1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = rows left
        mov     esi, edi                ; esi = h, used to spot the first row

        jmp     .next_row               ; hop over the alignment padding
        align   32
.next_row:
        ;; weightvec[0]: blk1 row against blk2 row
        prefetcht0 [eax+edx]            ; start fetching the next blk1 row
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm6, mm4                ; keep a copy for weightvec[2]
        movq    mm5, [ebx]              ; blk2 bytes 0..7, reused below
        psadbw  mm4, mm5
        paddd   mm0, mm4
        movq    mm4, [eax+8]            ; blk1 bytes 8..15
        movq    mm7, mm4                ; keep a copy for weightvec[2]
        psadbw  mm4, [ebx+8]
        paddd   mm0, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above0

        ;; weightvec[2]: blk1 row against the previous blk2 row
        sub     ebx, edx                ; step back to the previous blk2 row
        pshufw  mm0, mm0, SWAP_DWORDS   ; bring weightvec[2] into the low dword
        movq    mm2, [ebx]              ; kept for the weightvec[3] step
        psadbw  mm6, mm2
        paddd   mm0, mm6
        movq    mm3, [ebx+8]            ; kept for the weightvec[3] step
        psadbw  mm7, mm3
        add     ebx, edx                ; back to the current blk2 row
        paddd   mm0, mm7
        pshufw  mm0, mm0, SWAP_DWORDS   ; weightvec[0] back in the low dword

.skip_above0:
        ;; weightvec[1]: blk1 row shifted by one byte against blk2 row
        movq    mm4, [eax+1]
        movq    mm6, mm4                ; keep a copy for weightvec[3]
        psadbw  mm4, mm5                ; mm5 still holds blk2 bytes 0..7
        paddd   mm1, mm4
        movq    mm4, [eax+9]
        movq    mm7, mm4                ; keep a copy for weightvec[3]
        psadbw  mm4, [ebx+8]
        paddd   mm1, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above1

        ;; weightvec[3]: shifted blk1 row against the previous blk2 row,
        ;; which is already in mm2/mm3, so ebx needs no adjustment here.
        pshufw  mm1, mm1, SWAP_DWORDS   ; bring weightvec[3] into the low dword
        psadbw  mm6, mm2
        psadbw  mm7, mm3
        paddd   mm1, mm6
        paddd   mm1, mm7
        pshufw  mm1, mm1, SWAP_DWORDS   ; weightvec[1] back in the low dword

.skip_above1:
        add     eax, edx                ; advance both pointers one row
        add     ebx, edx

        sub     edi, 1
        jnz     near .next_row

        ;; Row h of blk1 (one past the loop) against the last blk2 row
        ;; completes weightvec[2] and weightvec[3].
        pshufw  mm0, mm0, SWAP_DWORDS   ; weightvec[2] into the low dword
        movq    mm4, [eax]
        movq    mm5, [eax+8]
        sub     ebx, edx                ; ebx = blk2 row h-1
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm0, mm4
        paddd   mm0, mm5

        pshufw  mm1, mm1, SWAP_DWORDS   ; weightvec[3] into the low dword
        movq    mm4, [eax+1]
        movq    mm5, [eax+9]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm1, mm4
        paddd   mm1, mm5

        ;; At this point the low dwords hold weightvec[2] and weightvec[3];
        ;; store them, swap, then store weightvec[0] and weightvec[1].
        mov     eax, [ebp+24]           ; eax = weightvec
        movd    [eax+8], mm0
        pshufw  mm0, mm0, SWAP_DWORDS
        movd    [eax+12], mm1
        pshufw  mm1, mm1, SWAP_DWORDS
        movd    [eax+0], mm0
        movd    [eax+4], mm1

        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax

        pop     ebp
        emms
        ret                             ; return to the caller instead of
                                        ; running on into whatever follows
global mblockq_dist22_mmxe

; pshufw selector that exchanges the low and high dwords of an MMX
; register (words 2,3,0,1).
%ifndef SWAP_DWORDS
%define SWAP_DWORDS (2*1 + 3*4 + 0*16 + 1*64)
%endif

;-----------------------------------------------------------------------
; void mblockq_dist22_mmxe(unsigned char *blk1, unsigned char *blk2,
;                          int flx, int fh, int *resvec);
;
; Quad SAD over 8-byte-wide rows.  Both pointers move BACKWARDS through
; memory: each iteration handles one row and then subtracts flx from
; blk1 and blk2.  The blk1 row loaded in one iteration is kept in
; mm2/mm3 and compared, in the next iteration, against the blk2 row one
; line further up.
;
; ABI:   32-bit, all arguments on the stack (read from [ebp+8]..[ebp+24]).
; In:    blk1   = row of the first block at which the walk starts
;        blk2   = row of the second block at which the walk starts
;        flx    = distance in bytes between consecutive rows
;        fh     = number of rows; must be >= 1, because the counter is
;                 decremented before it is tested
;        resvec = receives four 32-bit results
; Out:   with k = 0..fh-1, B1(k) = blk1 - k*flx, B2(k) = blk2 - k*flx:
;        resvec[0] = sum of SAD8(B1(k),           B2(k))
;        resvec[1] = sum of SAD8(B1(k) + 1,       B2(k))
;        resvec[2] = sum of SAD8(B1(k) + flx,     B2(k))
;        resvec[3] = sum of SAD8(B1(k) + flx + 1, B2(k))
; Reads: 9 bytes from each blk1 row from blk1 + flx down to
;        blk1 - (fh-1)*flx; 8 bytes from each blk2 row visited.
; Regs:  eax = blk1 row pointer     ebx = blk2 row pointer
;        ecx = rows left            edx = flx
;        mm0 = low dword resvec[0], high dword resvec[1]
;        mm1 = low dword resvec[2], high dword resvec[3]
;        mm2 = blk1 row one line below the current one
;        mm3 = the same row read one byte later
;        mm5 = current blk2 row
;        mm6, mm7 = temporaries
;        eax, ebx, ecx and edx are saved and restored; the MMX state is
;        released with emms before returning.
;-----------------------------------------------------------------------
        align   32
mblockq_dist22_mmxe:
        push    ebp                     ; save frame pointer
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; zero both packed accumulators
        pxor    mm1, mm1

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = flx
        mov     ecx, [ebp+20]           ; ecx = rows left
        movq    mm2, [eax+edx]          ; blk1 row below the starting row
        movq    mm3, [eax+edx+1]        ; same row, one byte later
        jmp     .next_row               ; hop over the alignment padding
        align   32
.next_row:
        movq    mm5, [ebx]              ; current blk2 row

        ;; resvec[2] / resvec[3]: blk1 row below against this blk2 row
        psadbw  mm2, mm5
        paddd   mm1, mm2                ; low dword  += SAD at +0
        psadbw  mm3, mm5
        pshufw  mm1, mm1, SWAP_DWORDS
        paddd   mm1, mm3                ; high dword += SAD at +1
        pshufw  mm1, mm1, SWAP_DWORDS

        ;; Load the current blk1 row; it also becomes the "row below"
        ;; for the next iteration.
        movq    mm2, [eax]
        movq    mm6, mm2
        movq    mm3, [eax+1]
        sub     eax, edx                ; move both pointers up one row
        sub     ebx, edx
        prefetcht0 [eax]
        movq    mm7, mm3

        ;; resvec[0] / resvec[1]: current blk1 row against this blk2 row
        psadbw  mm6, mm5
        paddd   mm0, mm6                ; low dword  += SAD at +0
        pshufw  mm0, mm0, SWAP_DWORDS
        psadbw  mm7, mm5
        paddd   mm0, mm7                ; high dword += SAD at +1
        pshufw  mm0, mm0, SWAP_DWORDS

        sub     ecx, 1
        jnz     .next_row

        mov     eax, [ebp+24]           ; eax = resvec
        movq    [eax+0], mm0            ; resvec[0], resvec[1]
        movq    [eax+8], mm1            ; resvec[2], resvec[3]

        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp

        emms
        ret