; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt

; MINGW g++: nasm paq7asm.asm -f win32 --prefix _
; DJGPP g++: nasm paq7asm.asm -f coff --prefix _
; Borland, Mars: nasm paq7asm.asm -f obj --prefix _
; Linux: nasm paq7asm.asm -f elf

; For other Windows compilers try -f win32 or -f obj. Some old versions
; of Linux should use -f aout instead of -f elf.

; This code will only work on a Pentium-MMX or higher. It doesn't
; use extended (Katmai/SSE) instructions. It won't work
; in 64-bit mode.
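
; These routines are exported with C linkage. A C++ caller would declare
; them roughly as follows (a sketch only: parameter names are taken from
; the comments below, and the void return type of train is an assumption):
;
;   extern "C" int dot_product(short* a, short* b, int n);
;   extern "C" int dot_product_sse2(short* a, short* b, int n);
;   extern "C" void train(short* t, short* w, int n, int err);
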
section .text use32 class=CODE

; Vector product a*b of n signed words, returning signed dword scaled
; down by 8 bits. n is rounded up to a multiple of 8.
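
; For reference, a scalar C sketch of the computation (assuming n has
; already been rounded up to a multiple of 8 and the arrays are padded
; accordingly; each pair of products is summed before the 8-bit downscale,
; matching pmaddwd below):
;
;   int dot_product(short* a, short* b, int n) {
;     int sum = 0;
;     for (int i = 0; i < n; i += 2)
;       sum += (a[i]*b[i] + a[i+1]*b[i+1]) >> 8;
;     return sum;
;   }
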
global dot_product ; (short* a, short* b, int n)
align 16
dot_product:
  mov eax, [esp+4]      ; a
  mov edx, [esp+8]      ; b
  mov ecx, [esp+12]     ; n
  add ecx, 7            ; n rounding up
  and ecx, -8           ; to a multiple of 8
  jz .done              ; nothing to do if n is 0
  sub eax, 8            ; bias pointers so the loop indexes from the end
  sub edx, 8
  pxor mm0, mm0         ; sum = 0
.loop:                  ; each loop sums 4 products
  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
  pmaddwd mm1, [edx+ecx*2]
  movq mm2, [eax+ecx*2-8]
  pmaddwd mm2, [edx+ecx*2-8]
  psrad mm1, 8          ; scale each pair of products down by 8 bits
  psrad mm2, 8
  paddd mm0, mm1        ; accumulate
  paddd mm0, mm2
  sub ecx, 8
  ja .loop
  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
  psrlq mm1, 32
  paddd mm0, mm1
  movd eax, mm0
  emms
.done:
  ret

; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.

global dot_product_sse2 ; (short* a, short* b, int n)
align 16
dot_product_sse2:
  mov eax, [esp+4]      ; a
  mov edx, [esp+8]      ; b
  mov ecx, [esp+12]     ; n
  add ecx, 7            ; n rounding up
  and ecx, -8           ; to a multiple of 8
  jz .done              ; nothing to do if n is 0
  sub eax, 16           ; bias pointers so the loop indexes from the end
  sub edx, 16
  pxor xmm0, xmm0       ; sum = 0
.loop:                  ; each loop sums 4 products
  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
  pmaddwd xmm1, [edx+ecx*2]
  psrad xmm1, 8         ; scale each pair of products down by 8 bits
  paddd xmm0, xmm1      ; accumulate
  sub ecx, 8
  ja .loop
  movdqa xmm1, xmm0     ; add 4 parts of xmm0 and return in eax
  psrldq xmm1, 8
  paddd xmm0, xmm1
  movdqa xmm1, xmm0
  psrldq xmm1, 4
  paddd xmm0, xmm1
  movd eax, xmm0
.done:
  ret

; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K.
; n is rounded up to a multiple of 8.
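
; For reference, a scalar C sketch of one weight update (an approximation:
; the MMX code below uses saturating adds throughout, which this sketch
; models only with a final clamp):
;
;   void train(short* t, short* w, int n, int err) {
;     for (int i = 0; i < n; ++i) {
;       int wt = w[i] + (((t[i]*err*2 >> 16) + 1) >> 1);
;       if (wt < -32768) wt = -32768;  // bound to +- 32K
;       if (wt > 32767) wt = 32767;
;       w[i] = wt;
;     }
;   }
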
global train ; (short* t, short* w, int n, int err)
align 16
train:
  mov eax, [esp+16]     ; err
  and eax, 0xffff       ; put 4 copies of err in mm0
  movd mm0, eax
  movd mm1, eax
  psllq mm1, 16
  por mm0, mm1          ; mm0 = 0, 0, err, err
  movq mm1, mm0
  psllq mm1, 32
  por mm0, mm1          ; mm0 = err, err, err, err
  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
  psrlw mm1, 15
  mov eax, [esp+4]      ; t
  mov edx, [esp+8]      ; w
  mov ecx, [esp+12]     ; n
  add ecx, 7            ; n/8 rounding up
  and ecx, -8           ; to a multiple of 8
  jz .done              ; nothing to do if n is 0
  sub eax, 8            ; bias pointers so the loop indexes from the end
  sub edx, 8
.loop:                  ; each iteration adjusts 8 weights
  movq mm2, [edx+ecx*2] ; w[i]
  movq mm3, [eax+ecx*2] ; t[i]
  movq mm4, [edx+ecx*2-8] ; w[i]
  movq mm5, [eax+ecx*2-8] ; t[i]
  paddsw mm3, mm3       ; t[i]*2 (saturated)
  pmulhw mm3, mm0       ; t[i]*err*2 >> 16
  paddsw mm3, mm1       ; (t[i]*err*2 >> 16)+1
  psraw mm3, 1          ; ((t[i]*err*2 >> 16)+1) >> 1
  paddsw mm2, mm3       ; w[i] += update, saturated to +- 32K
  paddsw mm5, mm5       ; same for the second group of 4 weights
  pmulhw mm5, mm0
  paddsw mm5, mm1
  psraw mm5, 1
  paddsw mm4, mm5
  movq [edx+ecx*2], mm2
  movq [edx+ecx*2-8], mm4
  sub ecx, 8
  ja .loop
.done:
  emms
  ret