2 * Mesa 3-D graphics library
5 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
 * Declare _mesa_x86_64_cpuid as a global symbol with hidden visibility:
 * other object files in the same link can reference it, but it is not
 * exported from the resulting shared object.
 */
32 .globl _mesa_x86_64_cpuid
33 .hidden _mesa_x86_64_cpuid
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Transforms 4-component vertices by a full 4x4 matrix using SSE.
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix, 16 floats read with movaps at byte offsets 0, 16,
 *             32 and 48, so the matrix must be 16-byte aligned
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 *
 * Per vertex, for i = 0..3:
 *   D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i)
 *
 * Register comments list lanes from highest to lowest.
 * Uses %eax, %ecx and %xmm0-%xmm8.
 */
49 .globl _mesa_x86_64_transform_points4_general
50 .hidden _mesa_x86_64_transform_points4_general
51 _mesa_x86_64_transform_points4_general:
57 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
58 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
60 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
61 movl $4, V4F_SIZE(%rdi) /* set dest size */
62 .byte 0x66, 0x66, 0x66, 0x90 /* 4-byte nop: manual align += 4 */
63 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
65 testl %ecx, %ecx /* verify non-zero count */
69 movq V4F_START(%rdx), %rdx /* ptr to first src vertex (descriptor pointer is no longer needed) */
70 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex (descriptor pointer is no longer needed) */
/* Load the whole matrix once. It stays in xmm4-xmm7 for every vertex. */
74 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
75 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
76 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
77 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
78 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
/* Per-vertex work: broadcast each source component, scale one matrix
 * group by it, and sum the four partial products. */
82 movups (%rdx), %xmm8 /* ow | oz | oy | ox (unaligned load is allowed for the source) */
85 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
87 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
88 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
89 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
90 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
91 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
92 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
93 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
94 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
95 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
97 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
99 movaps %xmm0, (%rdi) /* aligned store, dest must be 16-byte aligned: ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * Constant data. The first 16 bytes form a lane mask: all ones in the
 * three low dwords and zero in the top dword. The next 12 bytes are three
 * zero dwords.
 * NOTE(review): presumably this is the p4_constants table used by
 * _mesa_x86_64_transform_points4_3d, which reads 16 bytes at offset 16 and
 * expects 1.0 in the top lane. Confirm the label and the fourth dword of
 * the second entry.
 */
113 .byte 0xff, 0xff, 0xff, 0xff /* mask dword 0: keep */
114 .byte 0xff, 0xff, 0xff, 0xff /* mask dword 1: keep */
115 .byte 0xff, 0xff, 0xff, 0xff /* mask dword 2: keep */
116 .byte 0x00, 0x00, 0x00, 0x00 /* mask dword 3: clear */
118 .byte 0x00, 0x00, 0x00, 0x00 /* second entry, dword 0: zero */
119 .byte 0x00, 0x00, 0x00, 0x00 /* second entry, dword 1: zero */
120 .byte 0x00, 0x00, 0x00, 0x00 /* second entry, dword 2: zero */
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Transforms 4-component vertices by a 4x4 matrix using SSE, after first
 * forcing matrix elements m3, m7, m11 and m15 (the top lane of each
 * 16-byte matrix load) to 0.0, 0.0, 0.0 and 1.0.
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix, 16 floats read with movaps at byte offsets 0, 16,
 *             32 and 48, so the matrix must be 16-byte aligned
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets, VEC_SIZE_4 and p4_constants are defined outside
 * this block.
 *
 * Register comments list lanes from highest to lowest.
 * Uses %rax, %ecx and %xmm0-%xmm10.
 */
125 .globl _mesa_x86_64_transform_points4_3d
126 .hidden _mesa_x86_64_transform_points4_3d
128 * this is slower than _mesa_x86_64_transform_points4_general
129 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
131 _mesa_x86_64_transform_points4_3d:
133 leaq p4_constants(%rip), %rax /* rax = address of the constant table (RIP-relative) */
138 movaps 16(%rax), %xmm10 /* second 16-byte constant, OR-ed into xmm7 below */
140 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
141 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, one byte zero-extended (overwrites the table address in rax) */
143 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
144 movl $4, V4F_SIZE(%rdi) /* set dest size */
145 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
147 testl %ecx, %ecx /* verify non-zero count */
150 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
151 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Load the matrix and clear the top lane of every 16-byte group.
 * NOTE(review): xmm9 is expected to hold the lane mask (low three lanes
 * kept, top lane cleared) before these andps. Confirm where it is loaded. */
155 movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
156 movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
157 andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
158 movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
159 andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
160 movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
161 andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
162 andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
163 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
164 orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
/* Per-vertex work: broadcast each source component, scale one matrix
 * group by it, and sum the four partial products. */
168 movups (%rdx), %xmm8 /* ow | oz | oy | ox (unaligned load is allowed for the source) */
171 pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
173 pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
174 mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 (m3 was masked to 0.0) */
175 pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
176 mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 (m7 was masked to 0.0) */
177 pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
178 mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 (m11 was masked to 0.0) */
179 addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
180 mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 (m15 was forced to 1.0) */
181 addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
183 addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
185 movaps %xmm0, (%rdi) /* aligned store, dest must be 16-byte aligned: ->D(3) | ->D(2) | ->D(1) | ->D(0) */
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Entry sequence of the identity transform: copies the vertex count from
 * the source descriptor to the destination descriptor, marks the
 * destination as holding 4-component vertices, and fetches both data
 * pointers.
 *
 * In:  %rdi = destination vector descriptor
 *      %rsi = incoming value is never read here, it is overwritten with
 *             the source data pointer
 *      %rdx = source vector descriptor
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 * NOTE(review): the instructions that move the vertex data are not part
 * of these lines. Confirm how %eax (stride) and %ecx (count) are used.
 */
197 .globl _mesa_x86_64_transform_points4_identity
198 .hidden _mesa_x86_64_transform_points4_identity
199 _mesa_x86_64_transform_points4_identity:
201 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
202 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
204 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
205 movl $4, V4F_SIZE(%rdi) /* set dest size */
206 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
211 movq V4F_START(%rdx), %rsi /* rsi = ptr to first src vertex (note: rsi, not rdx) */
212 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * Transforms 4-component vertices with 3DNow! instructions, using only
 * the matrix elements m00, m11, m22 and m30, m31, m32:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix of floats, read at byte offsets 0, 20, 40, 48 and 56
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 *
 * Register comments list the high dword first, then the low dword.
 * Uses %eax, %ecx and %mm0-%mm7.
 * NOTE(review): MMX registers are written here. Confirm that the exit
 * path issues femms or emms before returning.
 */
226 .globl _mesa_3dnow_transform_points4_3d_no_rot
227 .hidden _mesa_3dnow_transform_points4_3d_no_rot
228 _mesa_3dnow_transform_points4_3d_no_rot:
230 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
231 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
233 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
234 movl $4, V4F_SIZE(%rdi) /* set dest size */
235 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
236 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
239 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
242 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
243 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements into mm0, mm1 and mm2. They stay there for
 * every vertex. */
247 movd (%rsi), %mm0 /* | m00 */
248 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
249 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
251 movd 40(%rsi), %mm2 /* | m22 */
252 movq 48(%rsi), %mm1 /* m31 | m30 */
254 punpckldq 56(%rsi), %mm2 /* m32 | m22 */
/* Per-vertex work. */
260 movq (%rdx), %mm4 /* x1 | x0 */
261 movq 8(%rdx), %mm5 /* x3 | x2 */
262 movd 12(%rdx), %mm7 /* | x3 (high dword is zeroed by movd) */
264 movq %mm5, %mm6 /* x3 | x2 */
265 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
267 punpckhdq %mm6, %mm6 /* x3 | x3 */
268 pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
270 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
271 pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 (each half is the sum of one operand's two dwords) */
273 pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
276 movq %mm4, (%rdi) /* write r0, r1 */
277 movq %mm5, 8(%rdi) /* write r2, r3 */
283 jnz p4_3d_no_rot_loop /* branch back to the loop head while ZF is clear */
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * Transforms 4-component vertices with 3DNow! instructions, using only
 * the matrix elements m00, m11, m20, m21, m22 and m32:
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix of floats, read at byte offsets 0, 20, 32, 40 and 56
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 *
 * Register comments list the high dword first, then the low dword.
 * Uses %eax, %ecx and %mm0-%mm7.
 * NOTE(review): MMX registers are written here. Confirm that the exit
 * path issues femms or emms before returning.
 */
291 .globl _mesa_3dnow_transform_points4_perspective
292 .hidden _mesa_3dnow_transform_points4_perspective
293 _mesa_3dnow_transform_points4_perspective:
295 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
296 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
298 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
299 movl $4, V4F_SIZE(%rdi) /* set dest size */
300 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
303 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
304 jz p4_perspective_done /* skip the loop when ZF is set */
306 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
307 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements into mm0, mm1 and mm2, and keep zero in mm7.
 * They stay there for every vertex. */
309 movd (%rsi), %mm0 /* | m00 */
310 pxor %mm7, %mm7 /* 0 | 0 */
311 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
313 movq 32(%rsi), %mm2 /* m21 | m20 */
316 movd 40(%rsi), %mm1 /* | m22 */
318 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
319 punpckldq 56(%rsi), %mm1 /* m32 | m22 */
/* Per-vertex work. */
324 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
326 movq (%rdx), %mm4 /* x1 | x0 */
327 movq 8(%rdx), %mm5 /* x3 | x2 */
328 movd 8(%rdx), %mm3 /* | x2 (high dword is zeroed by movd) */
330 movq %mm5, %mm6 /* x3 | x2 */
331 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
333 punpckldq %mm5, %mm5 /* x2 | x2 */
335 pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
336 pfsubr %mm7, %mm3 /* | -x2 (reverse subtract: 0 - x2) */
338 pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
339 pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
341 pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 (each half is the sum of one operand's two dwords) */
343 movq %mm5, (%rdi) /* write r0, r1 */
345 movq %mm6, 8(%rdi) /* write r2, r3 */
350 prefetch 32(%rdx) /* hopefully stride is zero */
351 jnz p4_perspective_loop /* branch back to the loop head while ZF is clear (prefetch leaves flags alone) */
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * Transforms 4-component vertices with 3DNow! instructions, using only
 * the matrix elements m00, m11, m30 and m31:
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix of floats, read at byte offsets 0, 20 and 48
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 *
 * Register comments list the high dword first, then the low dword.
 * Uses %eax, %ecx and %mm0, %mm1, %mm4, %mm5, %mm6.
 * NOTE(review): MMX registers are written here. Confirm that the exit
 * path issues femms or emms before returning.
 */
358 .globl _mesa_3dnow_transform_points4_2d_no_rot
359 .hidden _mesa_3dnow_transform_points4_2d_no_rot
360 _mesa_3dnow_transform_points4_2d_no_rot:
362 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
363 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
365 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
366 movl $4, V4F_SIZE(%rdi) /* set dest size */
367 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
370 .byte 0x90 /* 1-byte nop: manual align += 1 */
373 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
374 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements into mm0 and mm1. They stay there for every
 * vertex. */
376 movd (%rsi), %mm0 /* | m00 */
378 punpckldq 20(%rsi), %mm0 /* m11 | m00 */
380 movq 48(%rsi), %mm1 /* m31 | m30 */
/* Per-vertex work. */
384 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
386 movq (%rdx), %mm4 /* x1 | x0 */
387 movq 8(%rdx), %mm5 /* x3 | x2 (stored unchanged as r3, r2) */
389 pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
390 movq %mm5, %mm6 /* x3 | x2 */
392 punpckhdq %mm6, %mm6 /* x3 | x3 */
395 pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
397 prefetch 32(%rdx) /* hopefully stride is zero */
398 pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
400 movq %mm6, (%rdi) /* write r0, r1 */
401 movq %mm5, 8(%rdi) /* write r2, r3 */
406 jnz p4_2d_no_rot_loop /* branch back to the loop head while ZF is clear */
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * Transforms 4-component vertices with 3DNow! instructions, using only
 * the matrix elements m00, m01, m10, m11, m30 and m31:
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 * In:  %rdi = destination vector descriptor (count, size and flags are
 *             written, the start pointer is read)
 *      %rsi = matrix of floats, read at byte offsets 0, 4, 16, 20 and 48
 *      %rdx = source vector descriptor (count, stride and start pointer
 *             are read)
 * The V4F_* field offsets and VEC_SIZE_4 are defined outside this block.
 *
 * Register comments list the high dword first, then the low dword.
 * Uses %eax, %ecx and %mm0-%mm6.
 * NOTE(review): MMX registers are written here. Confirm that the exit
 * path issues femms or emms before returning.
 */
414 .globl _mesa_3dnow_transform_points4_2d
415 .hidden _mesa_3dnow_transform_points4_2d
416 _mesa_3dnow_transform_points4_2d:
418 movl V4F_COUNT(%rdx), %ecx /* ecx = source vertex count */
419 movzbl V4F_STRIDE(%rdx), %eax /* eax = source stride, read as one byte and zero-extended */
421 movl %ecx, V4F_COUNT(%rdi) /* set dest count */
422 movl $4, V4F_SIZE(%rdi) /* set dest size */
423 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
424 orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
427 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
430 movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
431 movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
/* Gather the matrix elements into mm0, mm1 and mm2. They stay there for
 * every vertex. */
433 movd (%rsi), %mm0 /* | m00 */
434 movd 4(%rsi), %mm1 /* | m01 */
438 punpckldq 16(%rsi), %mm0 /* m10 | m00 */
439 .byte 0x66, 0x66, 0x90 /* 3-byte nop: manual align += 3 */
440 punpckldq 20(%rsi), %mm1 /* m11 | m01 */
442 movq 48(%rsi), %mm2 /* m31 | m30 */
/* Per-vertex work. */
446 prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
448 movq (%rdx), %mm3 /* x1 | x0 */
449 movq 8(%rdx), %mm5 /* x3 | x2 (stored unchanged as r3, r2) */
451 movq %mm3, %mm4 /* x1 | x0 */
452 movq %mm5, %mm6 /* x3 | x2 */
454 pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
455 punpckhdq %mm6, %mm6 /* x3 | x3 */
457 pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
460 pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 (each half is the sum of one operand's two dwords) */
462 pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
463 prefetch 32(%rdx) /* hopefully stride is zero */
465 pfadd %mm6, %mm3 /* r1 | r0 */
467 movq %mm3, (%rdi) /* write r0, r1 */
468 movq %mm5, 8(%rdi) /* write r2, r3 */
/*
 * On ELF/Linux builds, emit an empty .note.GNU-stack section (no flags)
 * so that the linker does not request an executable stack on account of
 * this object file.
 */
481 #if defined (__ELF__) && defined (__linux__)
482 .section .note.GNU-stack,"",%progbits