aarch64: Add assembly support for -fsanitize=hwaddress tagged globals.
[libav.git] / libavcodec / x86 / h263_loopfilter.asm
blobcd726ba86d8d69f7c50be3805f668b1d883f8d9a
1 ;******************************************************************************
2 ;* MMX-optimized H.263 loop filter
3 ;*
4 ;* This file is part of Libav.
5 ;*
6 ;* Libav is free software; you can redistribute it and/or
7 ;* modify it under the terms of the GNU Lesser General Public
8 ;* License as published by the Free Software Foundation; either
9 ;* version 2.1 of the License, or (at your option) any later version.
11 ;* Libav is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;* Lesser General Public License for more details.
16 ;* You should have received a copy of the GNU Lesser General Public
17 ;* License along with Libav; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 ;******************************************************************************
21 %include "libavutil/x86/x86util.asm"
23 SECTION_RODATA
24 cextern pb_FC
25 cextern h263_loop_filter_strength
27 SECTION .text
; H263_LOOP_FILTER p0, p1, p2, p3, S
;   %1..%4 : addresses of four 8-byte pixel vectors, called p0..p3 below
;   %5     : movd source operand; only its low byte is used and is broadcast
;            to all 8 byte lanes (called S below; the callers in this file
;            pass 2 * strength)
; Nothing is stored to memory. Results are left in registers:
;   m5 = new p0 (%1), m3 = new p1 (%2), m4 = new p2 (%3), m6 = new p3 (%4)
; Clobbers m0-m7 and reads the external constant pb_FC.
29 %macro H263_LOOP_FILTER 5
30 pxor m7, m7 ; m7 = 0, used to zero-extend bytes to words
31 mova m0, [%1] ; m0 = p0 (low half is unpacked below)
32 mova m1, [%1] ; m1 = p0 (high half is unpacked below)
33 mova m2, [%4] ; m2 = p3
34 mova m3, [%4] ; m3 = p3
35 punpcklbw m0, m7 ; m0 = p0, pixels 0-3 as words
36 punpckhbw m1, m7 ; m1 = p0, pixels 4-7 as words
37 punpcklbw m2, m7 ; m2 = p3, pixels 0-3 as words
38 punpckhbw m3, m7 ; m3 = p3, pixels 4-7 as words
39 psubw m0, m2 ; m0 = p0 - p3 (low half), kept until the packsswb below
40 psubw m1, m3 ; m1 = p0 - p3 (high half)
41 mova m2, [%2] ; m2 = p1
42 mova m3, [%2] ; m3 = p1
43 mova m4, [%3] ; m4 = p2
44 mova m5, [%3] ; m5 = p2
45 punpcklbw m2, m7 ; m2 = p1, pixels 0-3 as words
46 punpckhbw m3, m7 ; m3 = p1, pixels 4-7 as words
47 punpcklbw m4, m7 ; m4 = p2, pixels 0-3 as words
48 punpckhbw m5, m7 ; m5 = p2, pixels 4-7 as words
49 psubw m4, m2 ; m4 = p2 - p1 (low half)
50 psubw m5, m3 ; m5 = p2 - p1 (high half)
51 psllw m4, 2 ; m4 = 4 * (p2 - p1)
52 psllw m5, 2 ; m5 = 4 * (p2 - p1)
53 paddw m4, m0 ; m4 = D = 4 * (p2 - p1) + (p0 - p3), low half
54 paddw m5, m1 ; m5 = D, high half
55 pxor m6, m6 ; m6 = 0
56 pcmpgtw m6, m4 ; m6 = 0xFFFF in lanes where D < 0 (low half)
57 pcmpgtw m7, m5 ; m7 = 0xFFFF in lanes where D < 0 (high half; m7 was 0)
58 pxor m4, m6 ; (D ^ mask) - mask below yields |D|
59 pxor m5, m7
60 psubw m4, m6 ; m4 = |D| (low half)
61 psubw m5, m7 ; m5 = |D| (high half)
62 psrlw m4, 3 ; m4 = |D| >> 3
63 psrlw m5, 3 ; m5 = |D| >> 3
64 packuswb m4, m5 ; m4 = |d| = |D| >> 3 as 8 bytes (unsigned saturation)
65 packsswb m6, m7 ; m6 = sign of D as a per-byte mask (0x00 or 0xFF)
66 pxor m7, m7 ; m7 = 0 again
67 movd m2, %5 ; low dword of m2 = %5
68 punpcklbw m2, m2 ; three self-interleaves replicate the low byte ...
69 punpcklbw m2, m2
70 punpcklbw m2, m2 ; ... into all 8 bytes: m2 = S
71 psubusb m2, m4 ; m2 = max(S - |d|, 0)
72 mova m3, m2
73 psubusb m3, m4 ; m3 = max(m2 - |d|, 0)
74 psubb m2, m3 ; m2 = min(max(S - |d|, 0), |d|), the correction magnitude a1
75 mova m3, [%2] ; m3 = p1
76 mova m4, [%3] ; m4 = p2
77 pxor m3, m6 ; complement pixels in lanes where D < 0 so that the
78 pxor m4, m6 ; saturating add/sub below work in the opposite direction there
79 paddusb m3, m2
80 psubusb m4, m2
81 pxor m3, m6 ; m3 = new p1: p1 + a1 where D >= 0, p1 - a1 where D < 0 (saturated)
82 pxor m4, m6 ; m4 = new p2: p2 - a1 where D >= 0, p2 + a1 where D < 0 (saturated)
83 paddusb m2, m2 ; m2 = 2 * a1 (unsigned saturation)
84 packsswb m0, m1 ; m0 = p0 - p3 as 8 signed bytes (signed saturation)
85 pcmpgtb m7, m0 ; m7 = 0xFF in lanes where p0 - p3 < 0
86 pxor m0, m7 ; (x ^ mask) - mask: per-byte absolute value
87 psubb m0, m7 ; m0 = |p0 - p3|
88 mova m1, m0
89 psubusb m0, m2 ; m0 = max(|p0 - p3| - 2 * a1, 0)
90 psubb m1, m0 ; m1 = min(|p0 - p3|, 2 * a1)
91 pand m1, [pb_FC] ; pb_FC is external; presumably 0xFC in every byte, clearing the low 2 bits so the word shift below cannot move bits across byte lanes - confirm
92 psrlw m1, 2 ; shift right by 2 (word-wide shift applied to packed bytes)
93 pxor m1, m7 ; reapply the sign of p0 - p3 ...
94 psubb m1, m7 ; ... m1 = signed correction d2
95 mova m5, [%1] ; m5 = p0
96 mova m6, [%4] ; m6 = p3
97 psubb m5, m1 ; m5 = new p0 = p0 - d2 (wrapping byte arithmetic)
98 paddb m6, m1 ; m6 = new p3 = p3 + d2 (wrapping byte arithmetic)
99 %endmacro
101 INIT_MMX mmx
102 ; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Filters 8 pixels across a horizontal edge: the rows at src - 2*stride,
; src - stride, src and src + stride are read, filtered and written back.
; Register roles after setup:
;   r0 = src, r1 = stride, r2 = 2 * h263_loop_filter_strength[qscale],
;   r3 = src - stride, r4 = src - 2*stride
; Uses m0-m7 (via H263_LOOP_FILTER); no emms is issued here.
103 cglobal h263_v_loop_filter, 3,5
104 movsxdifnidn r1, r1d ; widen the int stride to pointer width where needed
105 movsxdifnidn r2, r2d ; widen the int qscale so it can index the table
107 lea r4, [h263_loop_filter_strength] ; r4 = base of the strength table
108 movzx r3d, BYTE [r4+r2] ; r3 = strength[qscale] (one byte per entry)
109 movsx r2, r3b
110 shl r2, 1 ; r2 = 2 * strength, the macro's fifth argument
112 mov r3, r0
113 sub r3, r1 ; r3 = src - stride
114 mov r4, r3
115 sub r4, r1 ; r4 = src - 2*stride
116 H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d ; results: m5, m3, m4, m6 for the four rows in argument order
118 mova [r3], m3 ; row src - stride
119 mova [r0], m4 ; row src
120 mova [r4], m5 ; row src - 2*stride
121 mova [r0+r1], m6 ; row src + stride
; Return through the x86inc epilogue. Without it execution would fall through
; into the code that follows and saved registers would never be restored.
122 RET
; TRANSPOSE4X4 src, dst
;   %1 : address of the first of four rows; 4 bytes are read from each of
;        %1, %1+r1, %1+r1*2 and %1+r3
;   %2 : destination base; column k of the 4x4 byte block is written as one
;        dword at %2 + 8*k (k = 0..3)
; Implicit inputs: r1 is the row pitch and r3 the offset of the fourth row
; (the caller in this file sets r3 = r1*3). Clobbers m0-m3.
124 %macro TRANSPOSE4X4 2
125 movd m0, [%1] ; m0 = row 0: a0 a1 a2 a3
126 movd m1, [%1+r1] ; m1 = row 1: b0 b1 b2 b3
127 movd m2, [%1+r1*2] ; m2 = row 2: c0 c1 c2 c3
128 movd m3, [%1+r3] ; m3 = row 3: d0 d1 d2 d3
129 punpcklbw m0, m1 ; m0 = a0 b0 a1 b1 a2 b2 a3 b3
130 punpcklbw m2, m3 ; m2 = c0 d0 c1 d1 c2 d2 c3 d3
131 mova m1, m0
132 punpcklwd m0, m2 ; m0 = a0 b0 c0 d0 | a1 b1 c1 d1 (columns 0 and 1)
133 punpckhwd m1, m2 ; m1 = a2 b2 c2 d2 | a3 b3 c3 d3 (columns 2 and 3)
134 movd [%2+ 0], m0 ; store column 0
135 punpckhdq m0, m0 ; move column 1 into the low dword
136 movd [%2+ 8], m0 ; store column 1
137 movd [%2+16], m1 ; store column 2
138 punpckhdq m1, m1 ; move column 3 into the low dword
139 movd [%2+24], m1 ; store column 3
140 %endmacro
143 ; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Filters 8 rows across a vertical edge. The four bytes at src-2 .. src+1 of
; each row are transposed into four 8-byte column vectors on the stack, run
; through H263_LOOP_FILTER, then transposed back and stored to the same place.
; Register roles after setup:
;   r0 = src - 2, r1 = stride, r2 = 2 * h263_loop_filter_strength[qscale],
;   r3 = 3 * stride, r4 = r0 + 4 * stride (start of rows 4-7)
; NOTE(review): no RET/epilogue is visible after the last store in this view -
; confirm the function is terminated.
144 INIT_MMX mmx
145 cglobal h263_h_loop_filter, 3,5,0,32 ; the code below uses rsp+0 .. rsp+31 as scratch; the last argument presumably reserves those 32 bytes (x86inc cglobal) - confirm
146 movsxdifnidn r1, r1d ; widen the int stride to pointer width where needed
147 movsxdifnidn r2, r2d ; widen the int qscale so it can index the table
149 lea r4, [h263_loop_filter_strength] ; r4 = base of the strength table
150 movzx r3d, BYTE [r4+r2] ; r3 = strength[qscale] (one byte per entry)
151 movsx r2, r3b
152 shl r2, 1 ; r2 = 2 * strength, the filter macro's fifth argument
154 sub r0, 2 ; start two pixels to the left of src
155 lea r3, [r1*3] ; r3 = 3 * stride, the fourth-row offset used by TRANSPOSE4X4
157 TRANSPOSE4X4 r0, rsp ; rows 0-3 -> bytes 0-3 of the slots at rsp+0/8/16/24
158 lea r4, [r0+r1*4] ; r4 = first of rows 4-7
159 TRANSPOSE4X4 r4, rsp+4 ; rows 4-7 -> bytes 4-7 of the same four slots
161 H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d ; results: m5, m3, m4, m6 = filtered columns 0, 1, 2, 3
163 mova m1, m5 ; copy column 0 for the high-half unpack
164 mova m0, m4 ; copy column 2 for the high-half unpack
165 punpcklbw m5, m3 ; m5 = columns 0/1 interleaved, rows 0-3
166 punpcklbw m4, m6 ; m4 = columns 2/3 interleaved, rows 0-3
167 punpckhbw m1, m3 ; m1 = columns 0/1 interleaved, rows 4-7
168 punpckhbw m0, m6 ; m0 = columns 2/3 interleaved, rows 4-7
169 mova m3, m5
170 mova m6, m1
171 punpcklwd m5, m4 ; m5 = row 0 | row 1 (4 pixels each)
172 punpcklwd m1, m0 ; m1 = row 4 | row 5
173 punpckhwd m3, m4 ; m3 = row 2 | row 3
174 punpckhwd m6, m0 ; m6 = row 6 | row 7
175 movd [r0], m5 ; store row 0
176 punpckhdq m5, m5 ; move row 1 into the low dword
177 movd [r0+r1*1], m5 ; store row 1
178 movd [r0+r1*2], m3 ; store row 2
179 punpckhdq m3, m3 ; move row 3 into the low dword
180 movd [r0+r3], m3 ; store row 3
181 movd [r4], m1 ; store row 4
182 punpckhdq m1, m1 ; move row 5 into the low dword
183 movd [r4+r1*1], m1 ; store row 5
184 movd [r4+r1*2], m6 ; store row 6
185 punpckhdq m6, m6 ; move row 7 into the low dword
186 movd [r4+r3], m6 ; store row 7