4 #include <TargetConditionals.h>
12 #if !TARGET_IPHONE_SIMULATOR
15 "fmrx %[tmp], fpscr\n\t"
16 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
17 "fmxr fpscr, %[tmp]\n\t"
/*
 * releaseVFP: reads FPSCR into a scratch register, clears bits 16-18
 * (mask 7 << 16) and writes the result back. Per the inline comment below
 * this returns the VFP vector size to 1.
 *
 * The instruction sequence is only compiled when TARGET_IPHONE_SIMULATOR
 * is false.
 *
 * NOTE(review): the declaration of `tmp` and the asm statement wrapper are
 * not visible next to these lines - confirm they are present.
 * NOTE(review): declared `inline` without `static`; confirm the intended
 * linkage for the language mode this file is built in.
 */
24 inline void releaseVFP()
26 #if !TARGET_IPHONE_SIMULATOR
29 "fmrx %[tmp], fpscr\n\t" /* copy FPSCR into tmp */
30 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
31 "fmxr fpscr, %[tmp]\n\t" /* write the modified value back to FPSCR */
/*
 * vfill: stores copies of `val` to consecutive floats starting at `dest`.
 *
 *   dest - destination pointer; advanced by the post-incrementing stores.
 *   val  - value to replicate; staged in a 4-element stack array so it can
 *          be loaded into s8-s11 with a single multi-register load.
 *   len  - element counter; decremented by 16 by the `subs` below.
 *
 * Each pass through the visible sequence stores 8 floats unconditionally
 * and 8 more only if `len - 16` is still >= 0.
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): no loop label or branch is visible around the subs/store
 * sequence, and the declaration of `tmp` is not visible - presumably both
 * exist; confirm. Presumably `len` is expected to be a positive multiple
 * of the unroll width - verify against callers.
 */
39 inline void vfill(float *dest
, float val
, int len
)
41 #if !TARGET_IPHONE_SIMULATOR
/* Four copies of val laid out contiguously for the fldmias below. */
42 float t
[4] = {val
, val
, val
, val
};
/* Pointer to the staging array, passed to the asm as %[src1]. */
43 float *v
= (float *) t
;
/* Read-modify-write FPSCR: OR in 3 << 16. */
46 "fmrx %[tmp], fpscr\n\t"
47 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
48 "fmxr fpscr, %[tmp]\n\t"
/* Load the four staged copies of val into s8-s11 (no base writeback). */
50 "fldmias %[src1], {s8-s11}\n\t"
/* len -= 16, updating the condition flags used by the `ge` stores. */
52 "subs %[len], %[len], #16\n\t"
/* Two unconditional 4-float stores; dst advances after each. */
53 "fstmias %[dst]!, {s8-s11}\n\t"
54 "fstmias %[dst]!, {s8-s11}\n\t"
/* Two more 4-float stores, executed only if len is still >= 0. */
55 "fstmiasge %[dst]!, {s8-s11}\n\t"
56 "fstmiasge %[dst]!, {s8-s11}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
59 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
60 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, v and len are read-write early-clobber registers ("+&r");
 * tmp is a write-only early-clobber scratch register ("=&r").
 */
61 : [dst
] "+&r" (dest
), [src1
] "+&r" (v
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
63 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
64 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
65 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
66 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
/*
 * vmuladd: element-wise multiply-accumulate, dest[i] = a[i] + b[i] * c[i].
 *
 *   dest - output pointer (%[dst]); advanced by the post-incrementing stores.
 *   a    - accumulator input (%[src1]); loaded into the fmacs destination
 *          registers (s8-s15, s16-s23).
 *   b    - first multiplicand (%[src2]); loaded into s0-s7.
 *   c    - second multiplicand (%[src3]); loaded into s24-s31.
 *   len  - element counter; decremented by 16 by the `subs` below.
 *
 * The sequence is software-pipelined: 8 elements are loaded up front, and
 * each pass stores those 8 results while conditionally loading/computing
 * 8 more (`ge`) and conditionally preloading the following 8 (`gt`).
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): the asm statement header, the declaration of `tmp`, and
 * the loop label/branch are not visible here - confirm they are present.
 * Presumably `len` must be a positive multiple of the unroll width, since
 * the first 8 elements are loaded unconditionally - verify against callers.
 */
73 inline void vmuladd(float *dest
, float *a
, float *b
, float *c
, int len
)
75 #if !TARGET_IPHONE_SIMULATOR
/* Read-modify-write FPSCR: OR in 3 << 16. */
78 "fmrx %[tmp], fpscr\n\t"
79 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
80 "fmxr fpscr, %[tmp]\n\t"
/* Prologue: load the first 8 elements of b, a and c (4 floats per load). */
82 "fldmias %[src2]!, {s0-s3}\n\t"
83 "fldmias %[src1]!, {s8-s11}\n\t"
84 "fldmias %[src3]!, {s24-s27}\n\t"
85 "fldmias %[src2]!, {s4-s7}\n\t"
86 "fldmias %[src1]!, {s12-s15}\n\t"
87 "fldmias %[src3]!, {s28-s31}\n\t"
/* First group: s8.. += s0.. * s24.. (a += b * c). */
88 "fmacs s8, s0, s24\n\t"
/* len -= 16; the flags set here drive every ge/gt-suffixed instruction below. */
90 "subs %[len], %[len], #16\n\t"
/* Second group of the first 8 elements. */
91 "fmacs s12, s4, s28\n\t"
/* If len >= 0: load the next 8 elements, using s16-s23 as accumulators. */
92 "fldmiasge %[src2]!, {s0-s3}\n\t"
93 "fldmiasge %[src1]!, {s16-s19}\n\t"
94 "fldmiasge %[src3]!, {s24-s27}\n\t"
95 "fldmiasge %[src2]!, {s4-s7}\n\t"
96 "fldmiasge %[src1]!, {s20-s23}\n\t"
97 "fldmiasge %[src3]!, {s28-s31}\n\t"
98 "fmacsge s16, s0, s24\n\t"
/* Store the first 8 results (always). */
99 "fstmias %[dst]!, {s8-s11}\n\t"
100 "fstmias %[dst]!, {s12-s15}\n\t"
101 "fmacsge s20, s4, s28\n\t"
/* If len > 0: reload s0-s15 and s24-s31 with the following 8 elements. */
102 "fldmiasgt %[src2]!, {s0-s3}\n\t"
103 "fldmiasgt %[src1]!, {s8-s11}\n\t"
104 "fldmiasgt %[src3]!, {s24-s27}\n\t"
105 "fldmiasgt %[src2]!, {s4-s7}\n\t"
106 "fldmiasgt %[src1]!, {s12-s15}\n\t"
107 "fldmiasgt %[src3]!, {s28-s31}\n\t"
/*
 * Runs under `ge` although the reloads above run under `gt`: when len is
 * exactly 0 this operates on registers that were not reloaded.
 * NOTE(review): harmless only if nothing stores s8-s11 afterwards in that
 * case - confirm against the (not visible) loop branch.
 */
108 "fmacsge s8, s0, s24\n\t"
/* If len >= 0: store the second 8 results. */
109 "fstmiasge %[dst]!, {s16-s19}\n\t"
110 "fstmiasge %[dst]!, {s20-s23}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
113 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
114 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, a, b, c and len are read-write early-clobber registers
 * ("+&r"); tmp is a write-only early-clobber scratch register ("=&r").
 */
115 : [dst
] "+&r" (dest
), [src1
] "+&r" (a
), [src2
] "+&r" (b
), [src3
] "+&r" (c
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
117 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
118 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
119 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
120 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
/*
 * vscalarmul: multiplies the floats read from `b` by the value loaded into
 * s0 and stores the products to `dest`.
 *
 *   dest   - output pointer (%[dst]); advanced by the post-incrementing stores.
 *   scalar - multiplier; replicated into the 4-element stack array `t`.
 *   b      - input pointer (%[src2]); advanced by the post-incrementing loads.
 *   len    - element counter; decremented by 16 by the `subs` below.
 *
 * The sequence is software-pipelined: 8 elements are loaded up front, and
 * each pass stores those 8 products while conditionally loading/computing
 * 8 more (`ge`) and conditionally preloading the following 8 (`gt`).
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): the %[src1] operand is bound to `s`, whose declaration is
 * not visible here - presumably a pointer to `t`; confirm. The declaration
 * of `tmp` and the loop label/branch are likewise not visible.
 * Presumably `len` must be a positive multiple of the unroll width, since
 * the first 8 elements are loaded unconditionally - verify against callers.
 */
125 inline void vscalarmul(float *dest
, float scalar
, float *b
, int len
)
127 #if !TARGET_IPHONE_SIMULATOR
/* Four copies of scalar laid out contiguously for the s0-s3 load below. */
128 float t
[4] = {scalar
, scalar
, scalar
, scalar
};
132 __asm__
__volatile__(
/* Read-modify-write FPSCR: OR in 3 << 16. */
133 "fmrx %[tmp], fpscr\n\t"
134 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
135 "fmxr fpscr, %[tmp]\n\t"
/* Load the multiplier into s0-s3 (no base writeback). */
137 "fldmias %[src1], {s0-s3}\n\t"
/* Prologue: load the first 8 elements of b. */
138 "fldmias %[src2]!, {s8-s11}\n\t"
139 "fldmias %[src2]!, {s12-s15}\n\t"
/* First group: s8.. = s8.. * s0. */
140 "fmuls s8, s8, s0\n\t"
/* len -= 16; the flags set here drive every ge/gt-suffixed instruction below. */
142 "subs %[len], %[len], #16\n\t"
/* Second group of the first 8 elements. */
143 "fmuls s12, s12, s0\n\t"
/* If len >= 0: load and multiply the next 8 elements in s24-s31. */
144 "fldmiasge %[src2]!, {s24-s27}\n\t"
145 "fldmiasge %[src2]!, {s28-s31}\n\t"
146 "fmulsge s24, s24, s0\n\t"
/* Store the first 8 products (always). */
147 "fstmias %[dst]!, {s8-s11}\n\t"
148 "fstmias %[dst]!, {s12-s15}\n\t"
149 "fmulsge s28, s28, s0\n\t"
/* If len > 0: reload s8-s15 with the following 8 elements. */
150 "fldmiasgt %[src2]!, {s8-s11}\n\t"
151 "fldmiasgt %[src2]!, {s12-s15}\n\t"
/*
 * Runs under `ge` although the reloads above run under `gt`: when len is
 * exactly 0 this re-multiplies registers that were not reloaded.
 * NOTE(review): harmless only if nothing stores s8-s11 afterwards in that
 * case - confirm against the (not visible) loop branch.
 */
152 "fmulsge s8, s8, s0\n\t"
/* If len >= 0: store the second 8 products. */
153 "fstmiasge %[dst]!, {s24-s27}\n\t"
154 "fstmiasge %[dst]!, {s28-s31}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
157 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
158 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, s, b and len are read-write early-clobber registers
 * ("+&r"); tmp is a write-only early-clobber scratch register ("=&r").
 */
159 : [dst
] "+&r" (dest
), [src1
] "+&r" (s
), [src2
] "+&r" (b
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
161 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
162 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
163 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
164 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
/*
 * vmul: element-wise product, dest[i] = a[i] * b[i].
 *
 *   dest - output pointer (%[dst]); advanced by the post-incrementing stores.
 *   a    - first input (%[src1]); loaded into s0-s7 and s16-s23.
 *   b    - second input (%[src2]); loaded into s8-s15 and s24-s31, which
 *          also receive the products.
 *   len  - element counter; decremented by 16 by the `subs` below.
 *
 * The sequence is software-pipelined: 8 elements are loaded up front, and
 * each pass stores those 8 products while conditionally loading/computing
 * 8 more (`ge`) and conditionally preloading the following 8 (`gt`).
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): the declaration of `tmp` and the loop label/branch are not
 * visible here - confirm they are present. Presumably `len` must be a
 * positive multiple of the unroll width, since the first 8 elements are
 * loaded unconditionally - verify against callers.
 */
169 inline void vmul(float *dest
, float *a
, const float *b
, int len
)
171 #if !TARGET_IPHONE_SIMULATOR
173 __asm__
__volatile__(
/* Read-modify-write FPSCR: OR in 3 << 16. */
174 "fmrx %[tmp], fpscr\n\t"
175 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
176 "fmxr fpscr, %[tmp]\n\t"
/* Prologue: load the first 8 elements of a (s0-s7) and b (s8-s15). */
178 "fldmias %[src1]!, {s0-s3}\n\t"
179 "fldmias %[src2]!, {s8-s11}\n\t"
180 "fldmias %[src1]!, {s4-s7}\n\t"
181 "fldmias %[src2]!, {s12-s15}\n\t"
/* First group: s8.. = s0.. * s8.. */
182 "fmuls s8, s0, s8\n\t"
/* len -= 16; the flags set here drive every ge/gt-suffixed instruction below. */
184 "subs %[len], %[len], #16\n\t"
/* Second group of the first 8 elements. */
185 "fmuls s12, s4, s12\n\t"
/* If len >= 0: load the next 8 elements of a (s16-s23) and b (s24-s31). */
186 "fldmiasge %[src1]!, {s16-s19}\n\t"
187 "fldmiasge %[src2]!, {s24-s27}\n\t"
188 "fldmiasge %[src1]!, {s20-s23}\n\t"
189 "fldmiasge %[src2]!, {s28-s31}\n\t"
190 "fmulsge s24, s16, s24\n\t"
/* Store the first 8 products (always). */
191 "fstmias %[dst]!, {s8-s11}\n\t"
192 "fstmias %[dst]!, {s12-s15}\n\t"
193 "fmulsge s28, s20, s28\n\t"
/* If len > 0: reload s0-s15 with the following 8 elements. */
194 "fldmiasgt %[src1]!, {s0-s3}\n\t"
195 "fldmiasgt %[src2]!, {s8-s11}\n\t"
196 "fldmiasgt %[src1]!, {s4-s7}\n\t"
197 "fldmiasgt %[src2]!, {s12-s15}\n\t"
/*
 * Runs under `ge` although the reloads above run under `gt`: when len is
 * exactly 0 this re-multiplies registers that were not reloaded.
 * NOTE(review): harmless only if nothing stores s8-s11 afterwards in that
 * case - confirm against the (not visible) loop branch.
 */
198 "fmulsge s8, s0, s8\n\t"
/* If len >= 0: store the second 8 products. */
199 "fstmiasge %[dst]!, {s24-s27}\n\t"
200 "fstmiasge %[dst]!, {s28-s31}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
203 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
204 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, a, b and len are read-write early-clobber registers
 * ("+&r"); tmp is a write-only early-clobber scratch register ("=&r").
 */
205 : [dst
] "+&r" (dest
), [src1
] "+&r" (a
), [src2
] "+&r" (b
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
207 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
208 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
209 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
210 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
/*
 * vadd: element-wise sum, dest[i] = a[i] + b[i].
 *
 *   dest - output pointer (%[dst]); advanced by the post-incrementing stores.
 *   a    - first input (%[src1]); loaded into s0-s7 and s16-s23.
 *   b    - second input (%[src2]); loaded into s8-s15 and s24-s31, which
 *          also receive the sums.
 *   len  - element counter; decremented by 16 by the `subs` below.
 *
 * The sequence is software-pipelined: 8 elements are loaded up front, and
 * each pass stores those 8 sums while conditionally loading/computing
 * 8 more (`ge`) and conditionally preloading the following 8 (`gt`).
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): the declaration of `tmp` and the loop label/branch are not
 * visible here - confirm they are present. Presumably `len` must be a
 * positive multiple of the unroll width, since the first 8 elements are
 * loaded unconditionally - verify against callers.
 */
216 inline void vadd(float *dest
, float *a
, const float *b
, int len
)
218 #if !TARGET_IPHONE_SIMULATOR
220 __asm__
__volatile__(
/* Read-modify-write FPSCR: OR in 3 << 16. */
221 "fmrx %[tmp], fpscr\n\t"
222 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
223 "fmxr fpscr, %[tmp]\n\t"
/* Prologue: load the first 8 elements of a (s0-s7) and b (s8-s15). */
225 "fldmias %[src1]!, {s0-s3}\n\t"
226 "fldmias %[src2]!, {s8-s11}\n\t"
227 "fldmias %[src1]!, {s4-s7}\n\t"
228 "fldmias %[src2]!, {s12-s15}\n\t"
/* First group: s8.. = s0.. + s8.. */
229 "fadds s8, s0, s8\n\t"
/* len -= 16; the flags set here drive every ge/gt-suffixed instruction below. */
231 "subs %[len], %[len], #16\n\t"
/* Second group of the first 8 elements. */
232 "fadds s12, s4, s12\n\t"
/* If len >= 0: load the next 8 elements of a (s16-s23) and b (s24-s31). */
233 "fldmiasge %[src1]!, {s16-s19}\n\t"
234 "fldmiasge %[src2]!, {s24-s27}\n\t"
235 "fldmiasge %[src1]!, {s20-s23}\n\t"
236 "fldmiasge %[src2]!, {s28-s31}\n\t"
237 "faddsge s24, s16, s24\n\t"
/* Store the first 8 sums (always). */
238 "fstmias %[dst]!, {s8-s11}\n\t"
239 "fstmias %[dst]!, {s12-s15}\n\t"
240 "faddsge s28, s20, s28\n\t"
/* If len > 0: reload s0-s15 with the following 8 elements. */
241 "fldmiasgt %[src1]!, {s0-s3}\n\t"
242 "fldmiasgt %[src2]!, {s8-s11}\n\t"
243 "fldmiasgt %[src1]!, {s4-s7}\n\t"
244 "fldmiasgt %[src2]!, {s12-s15}\n\t"
/*
 * Runs under `ge` although the reloads above run under `gt`: when len is
 * exactly 0 this re-adds registers that were not reloaded.
 * NOTE(review): harmless only if nothing stores s8-s11 afterwards in that
 * case - confirm against the (not visible) loop branch.
 */
245 "faddsge s8, s0, s8\n\t"
/* If len >= 0: store the second 8 sums. */
246 "fstmiasge %[dst]!, {s24-s27}\n\t"
247 "fstmiasge %[dst]!, {s28-s31}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
250 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
251 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, a, b and len are read-write early-clobber registers
 * ("+&r"); tmp is a write-only early-clobber scratch register ("=&r").
 */
252 : [dst
] "+&r" (dest
), [src1
] "+&r" (a
), [src2
] "+&r" (b
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
254 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
255 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
256 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
257 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
/*
 * vcopy: copies floats from `a` to `dest` through VFP registers.
 *
 *   dest - output pointer (%[dst]); advanced by the post-incrementing stores.
 *   a    - input pointer (%[src1]); advanced by the post-incrementing loads.
 *   len  - element counter; decremented by 16 by the `subs` below.
 *
 * The sequence is software-pipelined: 8 elements are loaded up front, and
 * each pass stores those 8 while conditionally loading 8 more (`ge`) and
 * conditionally preloading the following 8 (`gt`).
 *
 * The asm path is only compiled when TARGET_IPHONE_SIMULATOR is false.
 *
 * NOTE(review): the declaration of `tmp`, the loop label/branch and the end
 * of the asm statement are not visible here - confirm they are present.
 * Presumably `len` must be a positive multiple of the unroll width, since
 * the first 8 elements are loaded unconditionally - verify against callers.
 */
262 inline void vcopy(float *dest
, float *a
, int len
)
264 #if !TARGET_IPHONE_SIMULATOR
266 __asm__
__volatile__(
/* Read-modify-write FPSCR: OR in 3 << 16. */
267 "fmrx %[tmp], fpscr\n\t"
268 "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
269 "fmxr fpscr, %[tmp]\n\t"
/* Prologue: load the first 8 elements into s8-s15. */
271 "fldmias %[src1]!, {s8-s11}\n\t"
272 "fldmias %[src1]!, {s12-s15}\n\t"
/* len -= 16; the flags set here drive every ge/gt-suffixed instruction below. */
274 "subs %[len], %[len], #16\n\t"
/* If len >= 0: load the next 8 elements into s24-s31. */
275 "fldmiasge %[src1]!, {s24-s27}\n\t"
276 "fldmiasge %[src1]!, {s28-s31}\n\t"
/* Store the first 8 elements (always). */
277 "fstmias %[dst]!, {s8-s11}\n\t"
278 "fstmias %[dst]!, {s12-s15}\n\t"
/* If len > 0: reload s8-s15 with the following 8 elements. */
279 "fldmiasgt %[src1]!, {s8-s11}\n\t"
280 "fldmiasgt %[src1]!, {s12-s15}\n\t"
/* If len >= 0: store the second 8 elements. */
281 "fstmiasge %[dst]!, {s24-s27}\n\t"
282 "fstmiasge %[dst]!, {s28-s31}\n\t"
/* Clear bits 16-18 of the FPSCR copy in tmp and write it back. */
285 "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
286 "fmxr fpscr, %[tmp]\n\t"
/*
 * Operands: dest, a and len are read-write early-clobber registers ("+&r");
 * tmp is a write-only early-clobber scratch register ("=&r").
 */
287 : [dst
] "+&r" (dest
), [src1
] "+&r" (a
), [len
] "+&r" (len
), [tmp
] "=&r" (tmp
)
/* All single-precision registers s0-s31 are named here (presumably the clobber list - confirm). */
289 : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
290 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
291 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
292 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",