1 /* Copyright (c) 2003-2005 Various contributors
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to
5 * deal in the Software without restriction, including without limitation the
6 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7 * sell copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE. */
24 #ifdef MT32EMU_HAVE_X86
/* GCC inline-asm helpers for x86 feature detection.
 * cpuid_flag is bit 21 of EFLAGS (the "ID" bit); being able to toggle it
 * indicates the CPUID instruction is supported.
 * NOTE(review): as written, eflag() loads `value` into EAX and then only
 * saves/restores EFLAGS (pushfl/popfl) without transferring between EAX and
 * EFLAGS — it does not appear to read back or set the flags. Presumably the
 * detection relies on this exact (odd) sequence; verify against upstream
 * before changing. */
30 #define eflag(value) __asm__ __volatile__("pushfl \n popfl \n" : : "a"(value))
31 #define cpuid_flag (1 << 21)
33 static inline bool atti386_DetectCPUID() {
/* Returns whether the CPU supports the CPUID instruction: set EFLAGS bit 21
 * (cpuid_flag), check it sticks; clear it, check it clears.
 * NOTE(review): this chunk is missing lines here — the declaration of
 * `result`, the eflag() invocations between the tests, and the return
 * statements are not visible; compare against the complete file. */
37 result
= cpuid_flag
; // set test
39 if (!(result
& cpuid_flag
))
42 result
= 0; // clear test
44 if (result
& cpuid_flag
)
50 static inline bool atti386_DetectSIMD() {
/* Returns whether the CPU supports SSE: requires CPUID, then checks bit 25
 * of the CPUID leaf-1 EDX feature flags (the SSE bit per the Intel SDM).
 * NOTE(review): the body of the CPUID asm statement is on lines missing
 * from this chunk; only the constraint tail and the bit test are visible. */
53 if (atti386_DetectCPUID() == false)
63 : "=r"(result
) : : "eax", "ecx", "edx");
65 if (result
& (1 << 25))
71 static inline bool atti386_Detect3DNow() {
/* Returns whether the CPU supports AMD 3DNow!: requires CPUID, then queries
 * extended leaf 0x80000001 and tests EDX bit 31 (0x80000000), the 3DNow!
 * feature bit per the AMD manuals.
 * NOTE(review): several asm lines and both return statements are missing
 * from this chunk. */
74 if (atti386_DetectCPUID() == false)
80 "movl $0x80000001, %%eax \n" \
84 : "=r"(result
) : : "eax", "ecx", "edx");
86 if (result
& 0x80000000)
/* SSE implementation of a two-section IIR filter step.
 * Operand mapping (see the constraint tail): %0 = output (running sample,
 * read at entry and written back at the end), %1 = coef_ptr (coefficient
 * array, loaded with movups), %2 = hist1_ptr (history/delay-line values,
 * updated in place via the movss stores to 0(%2)/4(%2)).
 * NOTE(review): declared to return float, but no return statement is visible
 * in this chunk, and several asm lines (history loads between the two
 * sections, pointer advances) are missing — verify against the complete
 * upstream file before relying on details. */
93 static inline float atti386_iir_filter_sse(float *output
, float *hist1_ptr
, float *coef_ptr
) {
94 __asm__
__volatile__ (
97 "movss 0(%0), %%xmm1 \n" \
98 "movups 0(%1), %%xmm2 \n" \
99 "movlps 0(%2), %%xmm3 \n" \
101 "shufps $0x44, %%xmm3, %%xmm3 \n" \
103 "mulps %%xmm3, %%xmm2 \n" \
105 "subss %%xmm2, %%xmm1 \n" \
106 "shufps $0x39, %%xmm2, %%xmm2 \n" \
107 "subss %%xmm2, %%xmm1 \n" \
109 "movss %%xmm1, 0(%2) \n" \
111 "shufps $0x39, %%xmm2, %%xmm2 \n" \
112 "addss %%xmm2, %%xmm1 \n" \
114 "shufps $0x39, %%xmm2, %%xmm2 \n" \
115 "addss %%xmm2, %%xmm1 \n" \
117 "movss %%xmm3, 4(%2) \n" \
122 "movups 0(%1), %%xmm2 \n" \
124 "movlps 0(%2), %%xmm3 \n" \
125 "shufps $0x44, %%xmm3, %%xmm3 \n" \
127 "mulps %%xmm3, %%xmm2 \n" \
129 "subss %%xmm2, %%xmm1 \n" \
130 "shufps $0x39, %%xmm2, %%xmm2 \n" \
131 "subss %%xmm2, %%xmm1 \n" \
133 "movss %%xmm1, 0(%2) \n" \
135 "shufps $0x39, %%xmm2, %%xmm2 \n" \
136 "addss %%xmm2, %%xmm1 \n" \
138 "shufps $0x39, %%xmm2, %%xmm2 \n" \
139 "addss %%xmm2, %%xmm1 \n" \
141 "movss %%xmm3, 4(%2) \n" \
142 "movss %%xmm1, 0(%0) \n" \
145 : : "r"(output
), "r"(coef_ptr
), "r"(hist1_ptr
)
148 , "xmm1", "xmm2", "xmm3"
/* 3DNow!/MMX implementation of the same two-section IIR filter step.
 * Operand mapping: %0 = output (in/out via "=m"), %1 = coef_ptr (walked in
 * EDI), %2 = hist1_ptr (walked in EAX), %3 = tmp (scratch memory).
 * NOTE(review): `tmp` appears in the constraint list but is not among the
 * visible parameters/locals — presumably a local declared on a line missing
 * from this chunk. The asm is also missing lines (history reloads, the
 * trailing femms/emms, and the return statement); verify against upstream. */
155 static inline float atti386_iir_filter_3DNow(float output
, float *hist1_ptr
, float *coef_ptr
) {
158 __asm__
__volatile__ (
159 "movq %0, %%mm1 \n" \
161 "movl %1, %%edi \n" \
162 "movq 0(%%edi), %%mm2 \n" \
164 "movl %2, %%eax; \n" \
165 "movq 0(%%eax), %%mm3 \n" \
167 "pfmul %%mm3, %%mm2 \n" \
168 "pfsub %%mm2, %%mm1 \n" \
170 "psrlq $32, %%mm2 \n" \
171 "pfsub %%mm2, %%mm1 \n" \
173 "movd %%mm1, %3 \n" \
175 "addl $8, %%edi \n" \
176 "movq 0(%%edi), %%mm2 \n" \
177 "movq 0(%%eax), %%mm3 \n" \
179 "pfmul %%mm3, %%mm2 \n" \
180 "pfadd %%mm2, %%mm1 \n" \
182 "psrlq $32, %%mm2 \n" \
183 "pfadd %%mm2, %%mm1 \n" \
188 "movd %%mm3, 4(%%eax) \n" \
190 "addl $8, %%edi \n" \
191 "addl $8, %%eax \n" \
193 "movq 0(%%edi), %%mm2 \n" \
194 "movq 0(%%eax), %%mm3 \n" \
196 "pfmul %%mm3, %%mm2 \n" \
197 "pfsub %%mm2, %%mm1 \n" \
199 "psrlq $32, %%mm2 \n" \
200 "pfsub %%mm2, %%mm1 \n" \
202 "movd %%mm1, %3 \n" \
204 "addl $8, %%edi \n" \
205 "movq 0(%%edi), %%mm2 \n" \
206 "movq 0(%%eax), %%mm3 \n" \
208 "pfmul %%mm3, %%mm2 \n" \
209 "pfadd %%mm2, %%mm1 \n" \
211 "psrlq $32, %%mm2 \n" \
212 "pfadd %%mm2, %%mm1 \n" \
216 "movd %%mm3, 4(%%eax) \n" \
218 "movd %%mm1, %0 \n" \
220 : "=m"(output
) : "g"(coef_ptr
), "g"(hist1_ptr
), "m"(tmp
)
221 : "eax", "edi", "memory"
223 , "mm1", "mm2", "mm3"
/* MMX mix-in with volume: replicates the 16-bit `myvolume` into all four
 * lanes of mm3, then per 4-sample group computes
 *   snd[i] += (useBuf[i] * myvolume) >> 16   (pmulhw keeps the high word)
 * with ECX = tmplen as the group counter.
 * NOTE(review): the loop label, the ECX decrement and the conditional
 * branch, the volume load into EAX before the shll, and the trailing
 * emms are on lines missing from this chunk. */
230 static inline void atti386_produceOutput1(int tmplen
, Bit16s myvolume
, Bit16s
*useBuf
, Bit16s
*snd
) {
231 __asm__
__volatile__(
232 "movl %0, %%ecx \n" \
234 "shll $16, %%eax \n" \
236 "movd %%eax, %%mm3 \n" \
237 "movd %%eax, %%mm2 \n" \
238 "psllq $32, %%mm3 \n" \
239 "por %%mm2, %%mm3 \n" \
240 "movl %2, %%esi \n" \
241 "movl %3, %%edi \n" \
243 "movq 0(%%esi), %%mm1 \n" \
244 "movq 0(%%edi), %%mm2 \n" \
245 "pmulhw %%mm3, %%mm1 \n" \
246 "paddw %%mm2, %%mm1 \n" \
247 "movq %%mm1, 0(%%edi) \n" \
249 "addl $8, %%esi \n" \
250 "addl $8, %%edi \n" \
253 "cmpl $0, %%ecx \n" \
256 : : "g"(tmplen
), "g"(myvolume
), "g"(useBuf
), "g"(snd
)
257 : "eax", "ecx", "edi", "esi", "memory"
259 , "mm1", "mm2", "mm3"
/* SSE/MMX conversion of interleaved 16-bit stereo `snd` into the float
 * buffers `sndbufl`/`sndbufr`, scaled by *multFactor (loaded into xmm1).
 * Operand mapping: %0 = multFactor, %1 = snd, %2 = sndbufl, %3 = sndbufr,
 * %4 = len (halved and biased to form the ECX counter).
 * NOTE(review): this function is the most heavily truncated in the chunk —
 * the conversion loop bodies (the cvt/mul/store sequences and branches)
 * between the visible setup lines are missing entirely; do not infer the
 * exact channel layout from what remains. */
264 static inline void atti386_produceOutput2(Bit32u len
, Bit16s
*snd
, float *sndbufl
, float *sndbufr
, float *multFactor
) {
265 __asm__
__volatile__(
266 "movl %4, %%ecx \n" \
267 "shrl $1, %%ecx \n" \
268 "addl $4, %%ecx \n" \
271 "movl %0, %%esi \n" \
272 "movups 0(%%esi), %%xmm1 \n" \
274 "movl %1, %%esi \n" \
275 "movl %2, %%edi \n" \
277 "xorl %%eax, %%eax \n" \
278 "movw 0(%1), %%ax \n" \
282 "movd %%eax, %%mm1 \n" \
283 "psrlq $32, %%mm1 \n" \
284 "movw 0(%1), %%ax \n" \
287 "movd %%eax, %%mm2 \n" \
288 "por %%mm2, %%mm1 \n" \
294 "movl %1, %%esi \n" \
295 "movl %3, %%edi \n" \
300 : : "g"(multFactor
), "r"(snd
), "g"(sndbufl
), "g"(sndbufr
), "g"(len
)
301 : "eax", "ecx", "edi", "esi", "mm1", "mm2", "xmm1", "memory");
/* MMX additive mix: per iteration loads 4 samples from each buffer and
 * stores buf1[i] + buf2[i] (paddw — wrapping, not saturating) back into
 * buf1. ECX = len = number of 4-sample groups (caller pre-divides).
 * Mapping: %0 = len, %1 = buf1 (ESI, read/write), %2 = buf2 (EDI, read).
 * NOTE(review): the loop label/decrement/branch and the emms are on missing
 * lines, and the visible clobber list lacks the mm registers the asm uses —
 * presumably on a missing continuation line as in the sibling functions. */
304 static inline void atti386_mixBuffers(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
305 __asm__
__volatile__(
306 "movl %0, %%ecx \n" \
307 "movl %1, %%esi \n" \
308 "movl %2, %%edi \n" \
310 "movq 0(%%edi), %%mm1 \n" \
311 "movq 0(%%esi), %%mm2 \n" \
312 "paddw %%mm2, %%mm1 \n" \
313 "movq %%mm1, 0(%%esi) \n" \
314 "addl $8, %%edi \n" \
315 "addl $8, %%esi \n" \
317 "cmpl $0, %%ecx \n" \
320 : : "g"(len
), "g"(buf1
), "g"(buf2
)
321 : "ecx", "edi", "esi", "memory"
/* MMX ring-modulate-and-mix: per 4-sample group computes
 *   buf1[i] = buf1[i] + ((buf1[i] * buf2[i]) >> 16)
 * (pmulhw keeps the high word of the product; the original value is kept
 * in mm3 and added back). ECX = len = number of 4-sample groups.
 * NOTE(review): loop label/decrement/branch and emms are on lines missing
 * from this chunk. */
328 static inline void atti386_mixBuffersRingMix(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
329 __asm__
__volatile__(
330 "movl %0, %%ecx \n" \
331 "movl %1, %%esi \n" \
332 "movl %2, %%edi \n" \
334 "movq 0(%%esi), %%mm1 \n" \
335 "movq 0(%%edi), %%mm2 \n" \
336 "movq %%mm1, %%mm3 \n" \
337 "pmulhw %%mm2, %%mm1 \n" \
338 "paddw %%mm3, %%mm1 \n" \
339 "movq %%mm1, 0(%%esi) \n" \
340 "addl $8, %%edi \n" \
341 "addl $8, %%esi \n" \
343 "cmpl $0, %%ecx \n" \
346 : : "g"(len
), "g"(buf1
), "g"(buf2
)
347 : "ecx", "edi", "esi", "memory"
349 , "mm1", "mm2", "mm3"
/* MMX ring modulation: per 4-sample group computes
 *   buf1[i] = (buf1[i] * buf2[i]) >> 16
 * (pmulhw high word), storing back into buf1. ECX = len = group count.
 * NOTE(review): loop label/decrement/branch, emms, and the mm-register
 * clobber continuation line are missing from this chunk. */
354 static inline void atti386_mixBuffersRing(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
355 __asm__
__volatile__(
356 "movl %0, %%ecx \n" \
357 "movl %1, %%esi \n" \
358 "movl %2, %%edi \n" \
360 "movq 0(%%esi), %%mm1 \n" \
361 "movq 0(%%edi), %%mm2 \n" \
362 "pmulhw %%mm2, %%mm1 \n" \
363 "movq %%mm1, 0(%%esi) \n" \
364 "addl $8, %%edi \n" \
365 "addl $8, %%esi \n" \
367 "cmpl $0, %%ecx \n" \
370 : : "g"(len
), "g"(buf1
), "g"(buf2
)
371 : "ecx", "edi", "esi", "memory"
/* MMX partial-product output: builds a volume quad in mm1 (left/right pairs),
 * then per iteration reads two consecutive mono samples from p1buf (BX, DX),
 * duplicates each into a 32-bit pair, packs them into mm2, multiplies by the
 * volume quad (pmulhw, high word) and writes one 4-word group to partialBuf.
 * ECX = quadlen iterations.
 * NOTE(review): the loads of leftvol/rightvol into EAX before the first shll
 * are on missing lines, as are the loop label/decrement/branch and emms.
 * Also, the asm uses BX/DX but the visible clobber list omits EBX — EBX is
 * call-saved (and the GOT pointer in PIC code), so this looks like a real
 * defect; confirm against the complete file before fixing. */
378 static inline void atti386_partialProductOutput(int quadlen
, Bit16s leftvol
, Bit16s rightvol
, Bit16s
*partialBuf
, Bit16s
*p1buf
) {
379 __asm__
__volatile__(
380 "movl %0, %%ecx \n" \
382 "shll $16, %%eax \n" \
384 "movd %%eax, %%mm1 \n" \
385 "movd %%eax, %%mm2 \n" \
386 "psllq $32, %%mm1 \n" \
387 "por %%mm2, %%mm1 \n" \
388 "movl %3, %%edi \n" \
389 "movl %4, %%esi \n" \
392 "movw 0(%%esi), %%bx \n" \
393 "addl $2, %%esi \n" \
394 "movw 0(%%esi), %%dx \n" \
395 "addl $2, %%esi \n" \
397 "movw %%dx, %%ax \n" \
398 "shll $16, %%eax \n" \
399 "movw %%dx, %%ax \n" \
400 "movd %%eax, %%mm2 \n" \
401 "psllq $32, %%mm2 \n" \
402 "movw %%bx, %%ax \n" \
403 "shll $16, %%eax \n" \
404 "movw %%bx, %%ax \n" \
405 "movd %%eax, %%mm3 \n" \
406 "por %%mm3, %%mm2 \n" \
408 "pmulhw %%mm1, %%mm2 \n" \
409 "movq %%mm2, 0(%%edi) \n" \
410 "addl $8, %%edi \n" \
413 "cmpl $0, %%ecx \n" \
417 : : "g"(quadlen
), "g"(leftvol
), "g"(rightvol
), "g"(partialBuf
), "g"(p1buf
)
418 : "eax", "ecx", "edx", "edi", "esi", "memory"
420 , "mm1", "mm2", "mm3"
// NOTE(review): the lines below are fragments of the MSVC (__asm) CPUID
// detection path and of the DetectSIMD()/Detect3DNow() dispatch wrappers.
// The enclosing function headers, the #ifdef compiler-selection branches,
// the __asm block openers/closers and most statements are missing from this
// chunk — these lines cannot be edited safely without the complete file.
// The visible logic mirrors the GCC path above: toggle the EFLAGS ID bit to
// prove CPUID exists, then test CPUID.1:EDX bit 25 (SSE).
432 pop eax
// get EFLAGS into eax
433 mov ebx
,eax
// keep a copy
438 popfd
// set new EFLAGS
440 pop eax
// EFLAGS back into eax
443 // have we changed the ID bit?
446 // No, no CPUID instruction
448 // we could toggle the
449 // ID bit so CPUID is present
452 cpuid
// get processor features
453 test edx
,1<<25 // check the SIMD bit
463 return atti386_DetectSIMD();
469 bool found3D
= false;
498 return atti386_Detect3DNow();
/* Public SSE IIR filter entry point: scales `input` by the first coefficient
 * (the overall filter gain), advances coef_ptr past it, then runs the
 * platform-specific filter body. The MSVC __asm path is interleaved below
 * (its __asm opener, register setup and most instructions are missing from
 * this chunk); the GCC path is the call to atti386_iir_filter_sse at the
 * end, which takes &output so the helper can read and write the running
 * sample in place.
 * NOTE(review): the declaration of the local `output` and the function's
 * return statement are on missing lines. */
502 float iir_filter_sse(float input
,float *hist1_ptr
, float *coef_ptr
) {
505 // 1st number of coefficients array is overall input scale factor, or filter gain
506 output
= input
* (*coef_ptr
++);
518 shufps xmm3
, xmm3
, 44h
519 // hist1_ptr+1, hist1_ptr, hist1_ptr+1, hist1_ptr
524 // Rotate elements right
525 shufps xmm2
, xmm2
, 39h
529 movss DWORD PTR
[eax
], xmm1
531 // Rotate elements right
532 shufps xmm2
, xmm2
, 39h
535 // Rotate elements right
536 shufps xmm2
, xmm2
, 39h
539 // Store previous hist
540 movss DWORD PTR
[eax
+4], xmm3
550 shufps xmm3
, xmm3
, 44h
551 // hist1_ptr+1, hist1_ptr, hist1_ptr+1, hist1_ptr
556 // Rotate elements right
557 shufps xmm2
, xmm2
, 39h
561 movss DWORD PTR
[eax
], xmm1
563 // Rotate elements right
564 shufps xmm2
, xmm2
, 39h
567 // Rotate elements right
568 shufps xmm2
, xmm2
, 39h
571 // Store previous hist
572 movss DWORD PTR
[eax
+4], xmm3
577 output
= atti386_iir_filter_sse(&output
, hist1_ptr
, coef_ptr
);
/* Public 3DNow! IIR filter entry point: scales `input` by the first
 * coefficient (filter gain), advances coef_ptr, then runs the
 * platform-specific body — MSVC __asm fragments below (mostly missing from
 * this chunk), or the GCC call to atti386_iir_filter_3DNow (which, unlike
 * the SSE wrapper, passes `output` by value and returns it).
 * NOTE(review): the declaration of the local `output` and the return
 * statement are on missing lines. */
582 float iir_filter_3dnow(float input
,float *hist1_ptr
, float *coef_ptr
) {
585 // 1st number of coefficients array is overall input scale factor, or filter gain
586 output
= input
* (*coef_ptr
++);
588 // I find it very sad that 3DNow requires twice as many instructions as Intel's SSE
589 // Intel does have the upper hand here.
622 movd DWORD PTR
[eax
+4], mm3
651 movd DWORD PTR
[eax
+4], mm3
658 output
= atti386_iir_filter_3DNow(output
, hist1_ptr
, coef_ptr
);
663 #if MT32EMU_USE_MMX > 0
/* Dispatch wrapper: handles len>>1 iterations via the MMX helper (each
 * helper iteration consumes two mono samples and writes four output words).
 * NOTE(review): the MSVC __asm alternative and the return statement
 * (presumably the number of samples processed) are on lines missing from
 * this chunk. */
665 int i386_partialProductOutput(int len
, Bit16s leftvol
, Bit16s rightvol
, Bit16s
*partialBuf
, Bit16s
*mixedBuf
) {
666 int tmplen
= len
>> 1;
709 atti386_partialProductOutput(tmplen
, leftvol
, rightvol
, partialBuf
, mixedBuf
);
/* Dispatch wrapper for the additive mix: the MMX helper handles four
 * 16-bit samples per iteration, so it is given len >> 2 groups.
 * NOTE(review): the MSVC __asm alternative and the return statement
 * (presumably the count of samples actually processed, leaving the
 * remainder to scalar code) are on lines missing from this chunk. */
714 int i386_mixBuffers(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
715 int tmplen
= len
>> 2;
739 atti386_mixBuffers(buf1
, buf2
, tmplen
);
/* Dispatch wrapper for ring-modulate-and-mix: the MMX helper handles four
 * samples per iteration, so it is given len >> 2 groups.
 * NOTE(review): the MSVC __asm alternative and the return statement are on
 * lines missing from this chunk. */
745 int i386_mixBuffersRingMix(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
746 int tmplen
= len
>> 2;
772 atti386_mixBuffersRingMix(buf1
, buf2
, tmplen
);
/* Dispatch wrapper for ring modulation: the MMX helper handles four samples
 * per iteration, so it is given len >> 2 groups.
 * NOTE(review): the MSVC __asm alternative and the return statement are on
 * lines missing from this chunk. */
777 int i386_mixBuffersRing(Bit16s
* buf1
, Bit16s
*buf2
, int len
) {
778 int tmplen
= len
>> 2;
802 atti386_mixBuffersRing(buf1
, buf2
, tmplen
);
/* Dispatch wrapper for volume-scaled mix-in: len is halved for the helper's
 * group count before calling the MMX path.
 * NOTE(review): this chunk ends mid-function — the MSVC __asm alternative,
 * the return statement and the closing brace are not visible; do not edit
 * without the complete file. Note the argument-order change versus the
 * helper: (tmplen, volume, useBuf, stream) maps onto the helper's
 * (tmplen, myvolume, useBuf, snd). */
807 int i386_produceOutput1(Bit16s
*useBuf
, Bit16s
*stream
, Bit32u len
, Bit16s volume
) {
808 int tmplen
= (len
>> 1);
840 atti386_produceOutput1(tmplen
, volume
, useBuf
, stream
);