4 * Copyright (c) 2022 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "exec/helper-proto.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/exec-all.h"
27 #include "qemu/int128.h"
28 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
32 void helper_set_svcr(CPUARMState
*env
, uint32_t val
, uint32_t mask
)
34 aarch64_set_svcr(env
, val
, mask
);
37 void helper_sme_zero(CPUARMState
*env
, uint32_t imm
, uint32_t svl
)
42 * Special case clearing the entire ZA space.
43 * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
44 * parts of the ZA storage outside of SVL.
47 memset(env
->zarray
, 0, sizeof(env
->zarray
));
52 * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
53 * so each row is discontiguous within ZA[].
55 for (i
= 0; i
< svl
; i
++) {
56 if (imm
& (1 << (i
% 8))) {
57 memset(&env
->zarray
[i
], 0, svl
);
/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
96 * Move Zreg vector to ZArray column.
98 #define DO_MOVA_C(NAME, TYPE, H) \
99 void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
101 int i, oprsz = simd_oprsz(desc); \
102 for (i = 0; i < oprsz; ) { \
103 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
106 *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
109 pg >>= sizeof(TYPE); \
114 DO_MOVA_C(sme_mova_cz_b
, uint8_t, H1
)
115 DO_MOVA_C(sme_mova_cz_h
, uint16_t, H1_2
)
116 DO_MOVA_C(sme_mova_cz_s
, uint32_t, H1_4
)
118 void HELPER(sme_mova_cz_d
)(void *za
, void *vn
, void *vg
, uint32_t desc
)
120 int i
, oprsz
= simd_oprsz(desc
) / 8;
125 for (i
= 0; i
< oprsz
; i
++) {
127 a
[tile_vslice_index(i
)] = n
[i
];
132 void HELPER(sme_mova_cz_q
)(void *za
, void *vn
, void *vg
, uint32_t desc
)
134 int i
, oprsz
= simd_oprsz(desc
) / 16;
140 * Int128 is used here simply to copy 16 bytes, and to simplify
141 * the address arithmetic.
143 for (i
= 0; i
< oprsz
; i
++) {
145 a
[tile_vslice_index(i
)] = n
[i
];
153 * Move ZArray column to Zreg vector.
155 #define DO_MOVA_Z(NAME, TYPE, H) \
156 void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
158 int i, oprsz = simd_oprsz(desc); \
159 for (i = 0; i < oprsz; ) { \
160 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
163 *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
166 pg >>= sizeof(TYPE); \
171 DO_MOVA_Z(sme_mova_zc_b
, uint8_t, H1
)
172 DO_MOVA_Z(sme_mova_zc_h
, uint16_t, H1_2
)
173 DO_MOVA_Z(sme_mova_zc_s
, uint32_t, H1_4
)
175 void HELPER(sme_mova_zc_d
)(void *vd
, void *za
, void *vg
, uint32_t desc
)
177 int i
, oprsz
= simd_oprsz(desc
) / 8;
182 for (i
= 0; i
< oprsz
; i
++) {
184 d
[i
] = a
[tile_vslice_index(i
)];
189 void HELPER(sme_mova_zc_q
)(void *vd
, void *za
, void *vg
, uint32_t desc
)
191 int i
, oprsz
= simd_oprsz(desc
) / 16;
197 * Int128 is used here simply to copy 16 bytes, and to simplify
198 * the address arithmetic.
200 for (i
= 0; i
< oprsz
; i
++, za
+= sizeof(ARMVectorReg
)) {
202 d
[i
] = a
[tile_vslice_index(i
)];
/*
 * Clear elements in a tile slice comprising len bytes.
 * PTR is the start of the slice and OFF the byte offset within it.
 */
typedef void ClearFn(void *ptr, size_t off, size_t len);
/*
 * ClearFn for a horizontal slice: the bytes are contiguous, so
 * zero LEN bytes starting OFF bytes past PTR.
 */
static void clear_horizontal(void *ptr, size_t off, size_t len)
{
    memset(ptr + off, 0, len);
}
/*
 * ClearFn for a vertical slice of byte elements: zero LEN bytes of
 * the slice beginning OFF bytes into it.  Each slice byte lives at
 * tile_vslice_offset() of its byte position from VPTR.
 */
static void clear_vertical_b(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; ++i) {
        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
/*
 * ClearFn for a vertical slice of 16-bit elements: zero LEN bytes of
 * the slice beginning OFF bytes into it, one 2-byte element at a time.
 */
static void clear_vertical_h(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 2) {
        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
/*
 * ClearFn for a vertical slice of 32-bit elements: zero LEN bytes of
 * the slice beginning OFF bytes into it, one 4-byte element at a time.
 */
static void clear_vertical_s(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 4) {
        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
/*
 * ClearFn for a vertical slice of 64-bit elements: zero LEN bytes of
 * the slice beginning OFF bytes into it, one 8-byte element at a time.
 */
static void clear_vertical_d(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 8) {
        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
    }
}
/*
 * ClearFn for a vertical slice of 128-bit elements: zero LEN bytes of
 * the slice beginning OFF bytes into it, one 16-byte element at a time.
 */
static void clear_vertical_q(void *vptr, size_t off, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memset(vptr + tile_vslice_offset(i + off), 0, 16);
    }
}
/*
 * Copy elements from an array into a tile slice comprising len bytes.
 * SRC is a contiguous buffer; DST is the start of the tile slice.
 */
typedef void CopyFn(void *dst, const void *src, size_t len);
/*
 * CopyFn for a horizontal slice: destination bytes are contiguous,
 * so this is a plain copy of LEN bytes from SRC to DST.
 */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}
/*
 * CopyFn for a vertical slice of byte elements: copy LEN bytes from
 * the contiguous buffer VSRC into the slice starting at VDST.
 */
static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
{
    const uint8_t *src = vsrc;
    uint8_t *dst = vdst;
    size_t i;

    for (i = 0; i < len; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
/*
 * CopyFn for a vertical slice of 16-bit elements: copy LEN bytes
 * (LEN / 2 elements) from the contiguous buffer VSRC into the slice
 * starting at VDST.
 */
static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
{
    const uint16_t *src = vsrc;
    uint16_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 2; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
/*
 * CopyFn for a vertical slice of 32-bit elements: copy LEN bytes
 * (LEN / 4 elements) from the contiguous buffer VSRC into the slice
 * starting at VDST.
 */
static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
{
    const uint32_t *src = vsrc;
    uint32_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 4; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
/*
 * CopyFn for a vertical slice of 64-bit elements: copy LEN bytes
 * (LEN / 8 elements) from the contiguous buffer VSRC into the slice
 * starting at VDST.
 */
static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
{
    const uint64_t *src = vsrc;
    uint64_t *dst = vdst;
    size_t i;

    for (i = 0; i < len / 8; ++i) {
        dst[tile_vslice_index(i)] = src[i];
    }
}
/*
 * CopyFn for a vertical slice of 128-bit elements: copy LEN bytes
 * from the contiguous buffer VSRC into the slice starting at VDST,
 * 16 bytes at a time.
 */
static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
{
    for (size_t i = 0; i < len; i += 16) {
        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
    }
}
/*
 * Host and TLB primitives for vertical tile slice addressing.
 *
 * DO_LD generates a pair of load primitives for one element TYPE:
 *   sme_<NAME>_v_host: load via HOST from a host pointer;
 *   sme_<NAME>_v_tlb:  load via TLB from guest address ADDR.
 * In both, OFF is the byte offset within the vertical slice and is
 * converted to a ZA storage offset with tile_vslice_offset().
 */
#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}
/*
 * DO_ST generates a pair of store primitives for one element TYPE:
 *   sme_<NAME>_v_host: store via HOST to a host pointer;
 *   sme_<NAME>_v_tlb:  store via TLB to guest address ADDR.
 * The value is read from the vertical slice at byte offset OFF,
 * converted to a ZA storage offset with tile_vslice_offset().
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}
/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 *
 * DO_LDQ generates 128-bit load primitives as two 64-bit loads:
 *   HNAME##_host / HNAME##_tlb take a direct byte offset into ZA;
 *   VNAME##_v_host / VNAME##_v_tlb take a vertical-slice offset and
 *   convert it with tile_vslice_offset() before delegating.
 * BE selects which loaded half lands in ptr[0] versus ptr[1].
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
/*
 * DO_STQ generates 128-bit store primitives as two 64-bit stores:
 *   HNAME##_host / HNAME##_tlb take a direct byte offset into ZA;
 *   VNAME##_v_host / VNAME##_v_tlb take a vertical-slice offset and
 *   convert it with tile_vslice_offset() before delegating.
 * BE selects which 64-bit half (ptr[BE]) is written to the lower
 * address; the other half (ptr[!BE]) goes 8 bytes above it.
 */
#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    HOST(host, ptr[BE]);                                                    \
    HOST(host + 8, ptr[!BE]);                                               \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t *ptr = za + off;                                               \
    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                                 target_ulong addr, uintptr_t ra)           \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
401 DO_LD(ld1b
, uint8_t, ldub_p
, cpu_ldub_data_ra
)
402 DO_LD(ld1h_be
, uint16_t, lduw_be_p
, cpu_lduw_be_data_ra
)
403 DO_LD(ld1h_le
, uint16_t, lduw_le_p
, cpu_lduw_le_data_ra
)
404 DO_LD(ld1s_be
, uint32_t, ldl_be_p
, cpu_ldl_be_data_ra
)
405 DO_LD(ld1s_le
, uint32_t, ldl_le_p
, cpu_ldl_le_data_ra
)
406 DO_LD(ld1d_be
, uint64_t, ldq_be_p
, cpu_ldq_be_data_ra
)
407 DO_LD(ld1d_le
, uint64_t, ldq_le_p
, cpu_ldq_le_data_ra
)
409 DO_LDQ(sve_ld1qq_be
, sme_ld1q_be
, 1, ldq_be_p
, cpu_ldq_be_data_ra
)
410 DO_LDQ(sve_ld1qq_le
, sme_ld1q_le
, 0, ldq_le_p
, cpu_ldq_le_data_ra
)
412 DO_ST(st1b
, uint8_t, stb_p
, cpu_stb_data_ra
)
413 DO_ST(st1h_be
, uint16_t, stw_be_p
, cpu_stw_be_data_ra
)
414 DO_ST(st1h_le
, uint16_t, stw_le_p
, cpu_stw_le_data_ra
)
415 DO_ST(st1s_be
, uint32_t, stl_be_p
, cpu_stl_be_data_ra
)
416 DO_ST(st1s_le
, uint32_t, stl_le_p
, cpu_stl_le_data_ra
)
417 DO_ST(st1d_be
, uint64_t, stq_be_p
, cpu_stq_be_data_ra
)
418 DO_ST(st1d_le
, uint64_t, stq_le_p
, cpu_stq_le_data_ra
)
420 DO_STQ(sve_st1qq_be
, sme_st1q_be
, 1, stq_be_p
, cpu_stq_be_data_ra
)
421 DO_STQ(sve_st1qq_le
, sme_st1q_le
, 0, stq_le_p
, cpu_stq_le_data_ra
)
429 * Common helper for all contiguous predicated loads.
432 static inline QEMU_ALWAYS_INLINE
433 void sme_ld1(CPUARMState
*env
, void *za
, uint64_t *vg
,
434 const target_ulong addr
, uint32_t desc
, const uintptr_t ra
,
435 const int esz
, uint32_t mtedesc
, bool vertical
,
436 sve_ldst1_host_fn
*host_fn
,
437 sve_ldst1_tlb_fn
*tlb_fn
,
441 const intptr_t reg_max
= simd_oprsz(desc
);
442 const intptr_t esize
= 1 << esz
;
443 intptr_t reg_off
, reg_last
;
448 /* Find the active elements. */
449 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, esize
)) {
450 /* The entire predicate was false; no load occurs. */
451 clr_fn(za
, 0, reg_max
);
455 /* Probe the page(s). Exit with exception for any invalid page. */
456 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_LOAD
, ra
);
458 /* Handle watchpoints for all active elements. */
459 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, esize
, esize
,
463 * Handle mte checks for all active elements.
464 * Since TBI must be set for MTE, !mtedesc => !mte_active.
467 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, esize
, esize
,
471 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
472 if (unlikely(flags
!= 0)) {
473 #ifdef CONFIG_USER_ONLY
474 g_assert_not_reached();
477 * At least one page includes MMIO.
478 * Any bus operation can fail with cpu_transaction_failed,
479 * which for ARM will raise SyncExternal. Perform the load
480 * into scratch memory to preserve register state until the end.
482 ARMVectorReg scratch
= { };
484 reg_off
= info
.reg_off_first
[0];
485 reg_last
= info
.reg_off_last
[1];
487 reg_last
= info
.reg_off_split
;
489 reg_last
= info
.reg_off_last
[0];
494 uint64_t pg
= vg
[reg_off
>> 6];
496 if ((pg
>> (reg_off
& 63)) & 1) {
497 tlb_fn(env
, &scratch
, reg_off
, addr
+ reg_off
, ra
);
500 } while (reg_off
& 63);
501 } while (reg_off
<= reg_last
);
503 cpy_fn(za
, &scratch
, reg_max
);
508 /* The entire operation is in RAM, on valid pages. */
510 reg_off
= info
.reg_off_first
[0];
511 reg_last
= info
.reg_off_last
[0];
512 host
= info
.page
[0].host
;
515 memset(za
, 0, reg_max
);
516 } else if (reg_off
) {
517 clr_fn(za
, 0, reg_off
);
520 set_helper_retaddr(ra
);
522 while (reg_off
<= reg_last
) {
523 uint64_t pg
= vg
[reg_off
>> 6];
525 if ((pg
>> (reg_off
& 63)) & 1) {
526 host_fn(za
, reg_off
, host
+ reg_off
);
527 } else if (vertical
) {
528 clr_fn(za
, reg_off
, esize
);
531 } while (reg_off
<= reg_last
&& (reg_off
& 63));
534 clear_helper_retaddr();
537 * Use the slow path to manage the cross-page misalignment.
538 * But we know this is RAM and cannot trap.
540 reg_off
= info
.reg_off_split
;
541 if (unlikely(reg_off
>= 0)) {
542 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
545 reg_off
= info
.reg_off_first
[1];
546 if (unlikely(reg_off
>= 0)) {
547 reg_last
= info
.reg_off_last
[1];
548 host
= info
.page
[1].host
;
550 set_helper_retaddr(ra
);
553 uint64_t pg
= vg
[reg_off
>> 6];
555 if ((pg
>> (reg_off
& 63)) & 1) {
556 host_fn(za
, reg_off
, host
+ reg_off
);
557 } else if (vertical
) {
558 clr_fn(za
, reg_off
, esize
);
561 } while (reg_off
& 63);
562 } while (reg_off
<= reg_last
);
564 clear_helper_retaddr();
568 static inline QEMU_ALWAYS_INLINE
569 void sme_ld1_mte(CPUARMState
*env
, void *za
, uint64_t *vg
,
570 target_ulong addr
, uint32_t desc
, uintptr_t ra
,
571 const int esz
, bool vertical
,
572 sve_ldst1_host_fn
*host_fn
,
573 sve_ldst1_tlb_fn
*tlb_fn
,
577 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
578 int bit55
= extract64(addr
, 55, 1);
580 /* Remove mtedesc from the normal sve descriptor. */
581 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
583 /* Perform gross MTE suppression early. */
584 if (!tbi_check(mtedesc
, bit55
) ||
585 tcma_check(mtedesc
, bit55
, allocation_tag_from_addr(addr
))) {
589 sme_ld1(env
, za
, vg
, addr
, desc
, ra
, esz
, mtedesc
, vertical
,
590 host_fn
, tlb_fn
, clr_fn
, cpy_fn
);
593 #define DO_LD(L, END, ESZ) \
594 void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
595 target_ulong addr, uint32_t desc) \
597 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
598 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
599 clear_horizontal, copy_horizontal); \
601 void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
602 target_ulong addr, uint32_t desc) \
604 sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
605 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
606 clear_vertical_##L, copy_vertical_##L); \
608 void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
609 target_ulong addr, uint32_t desc) \
611 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
612 sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
613 clear_horizontal, copy_horizontal); \
615 void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
616 target_ulong addr, uint32_t desc) \
618 sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
619 sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
620 clear_vertical_##L, copy_vertical_##L); \
630 DO_LD(q
, _be
, MO_128
)
631 DO_LD(q
, _le
, MO_128
)
636 * Common helper for all contiguous predicated stores.
639 static inline QEMU_ALWAYS_INLINE
640 void sme_st1(CPUARMState
*env
, void *za
, uint64_t *vg
,
641 const target_ulong addr
, uint32_t desc
, const uintptr_t ra
,
642 const int esz
, uint32_t mtedesc
, bool vertical
,
643 sve_ldst1_host_fn
*host_fn
,
644 sve_ldst1_tlb_fn
*tlb_fn
)
646 const intptr_t reg_max
= simd_oprsz(desc
);
647 const intptr_t esize
= 1 << esz
;
648 intptr_t reg_off
, reg_last
;
653 /* Find the active elements. */
654 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, esize
)) {
655 /* The entire predicate was false; no store occurs. */
659 /* Probe the page(s). Exit with exception for any invalid page. */
660 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_STORE
, ra
);
662 /* Handle watchpoints for all active elements. */
663 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, esize
, esize
,
667 * Handle mte checks for all active elements.
668 * Since TBI must be set for MTE, !mtedesc => !mte_active.
671 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, esize
, esize
,
675 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
676 if (unlikely(flags
!= 0)) {
677 #ifdef CONFIG_USER_ONLY
678 g_assert_not_reached();
681 * At least one page includes MMIO.
682 * Any bus operation can fail with cpu_transaction_failed,
683 * which for ARM will raise SyncExternal. We cannot avoid
684 * this fault and will leave with the store incomplete.
686 reg_off
= info
.reg_off_first
[0];
687 reg_last
= info
.reg_off_last
[1];
689 reg_last
= info
.reg_off_split
;
691 reg_last
= info
.reg_off_last
[0];
696 uint64_t pg
= vg
[reg_off
>> 6];
698 if ((pg
>> (reg_off
& 63)) & 1) {
699 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
702 } while (reg_off
& 63);
703 } while (reg_off
<= reg_last
);
708 reg_off
= info
.reg_off_first
[0];
709 reg_last
= info
.reg_off_last
[0];
710 host
= info
.page
[0].host
;
712 set_helper_retaddr(ra
);
714 while (reg_off
<= reg_last
) {
715 uint64_t pg
= vg
[reg_off
>> 6];
717 if ((pg
>> (reg_off
& 63)) & 1) {
718 host_fn(za
, reg_off
, host
+ reg_off
);
721 } while (reg_off
<= reg_last
&& (reg_off
& 63));
724 clear_helper_retaddr();
727 * Use the slow path to manage the cross-page misalignment.
728 * But we know this is RAM and cannot trap.
730 reg_off
= info
.reg_off_split
;
731 if (unlikely(reg_off
>= 0)) {
732 tlb_fn(env
, za
, reg_off
, addr
+ reg_off
, ra
);
735 reg_off
= info
.reg_off_first
[1];
736 if (unlikely(reg_off
>= 0)) {
737 reg_last
= info
.reg_off_last
[1];
738 host
= info
.page
[1].host
;
740 set_helper_retaddr(ra
);
743 uint64_t pg
= vg
[reg_off
>> 6];
745 if ((pg
>> (reg_off
& 63)) & 1) {
746 host_fn(za
, reg_off
, host
+ reg_off
);
749 } while (reg_off
& 63);
750 } while (reg_off
<= reg_last
);
752 clear_helper_retaddr();
756 static inline QEMU_ALWAYS_INLINE
757 void sme_st1_mte(CPUARMState
*env
, void *za
, uint64_t *vg
, target_ulong addr
,
758 uint32_t desc
, uintptr_t ra
, int esz
, bool vertical
,
759 sve_ldst1_host_fn
*host_fn
,
760 sve_ldst1_tlb_fn
*tlb_fn
)
762 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
763 int bit55
= extract64(addr
, 55, 1);
765 /* Remove mtedesc from the normal sve descriptor. */
766 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
768 /* Perform gross MTE suppression early. */
769 if (!tbi_check(mtedesc
, bit55
) ||
770 tcma_check(mtedesc
, bit55
, allocation_tag_from_addr(addr
))) {
774 sme_st1(env
, za
, vg
, addr
, desc
, ra
, esz
, mtedesc
,
775 vertical
, host_fn
, tlb_fn
);
778 #define DO_ST(L, END, ESZ) \
779 void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
780 target_ulong addr, uint32_t desc) \
782 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
783 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
785 void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
786 target_ulong addr, uint32_t desc) \
788 sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
789 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
791 void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
792 target_ulong addr, uint32_t desc) \
794 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
795 sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
797 void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
798 target_ulong addr, uint32_t desc) \
800 sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
801 sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
811 DO_ST(q
, _be
, MO_128
)
812 DO_ST(q
, _le
, MO_128
)
816 void HELPER(sme_addha_s
)(void *vzda
, void *vzn
, void *vpn
,
817 void *vpm
, uint32_t desc
)
819 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 4;
820 uint64_t *pn
= vpn
, *pm
= vpm
;
821 uint32_t *zda
= vzda
, *zn
= vzn
;
823 for (row
= 0; row
< oprsz
; ) {
824 uint64_t pa
= pn
[row
>> 4];
827 for (col
= 0; col
< oprsz
; ) {
828 uint64_t pb
= pm
[col
>> 4];
831 zda
[tile_vslice_index(row
) + H4(col
)] += zn
[H4(col
)];
834 } while (++col
& 15);
838 } while (++row
& 15);
842 void HELPER(sme_addha_d
)(void *vzda
, void *vzn
, void *vpn
,
843 void *vpm
, uint32_t desc
)
845 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
846 uint8_t *pn
= vpn
, *pm
= vpm
;
847 uint64_t *zda
= vzda
, *zn
= vzn
;
849 for (row
= 0; row
< oprsz
; ++row
) {
850 if (pn
[H1(row
)] & 1) {
851 for (col
= 0; col
< oprsz
; ++col
) {
852 if (pm
[H1(col
)] & 1) {
853 zda
[tile_vslice_index(row
) + col
] += zn
[col
];
860 void HELPER(sme_addva_s
)(void *vzda
, void *vzn
, void *vpn
,
861 void *vpm
, uint32_t desc
)
863 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 4;
864 uint64_t *pn
= vpn
, *pm
= vpm
;
865 uint32_t *zda
= vzda
, *zn
= vzn
;
867 for (row
= 0; row
< oprsz
; ) {
868 uint64_t pa
= pn
[row
>> 4];
871 uint32_t zn_row
= zn
[H4(row
)];
872 for (col
= 0; col
< oprsz
; ) {
873 uint64_t pb
= pm
[col
>> 4];
876 zda
[tile_vslice_index(row
) + H4(col
)] += zn_row
;
879 } while (++col
& 15);
883 } while (++row
& 15);
887 void HELPER(sme_addva_d
)(void *vzda
, void *vzn
, void *vpn
,
888 void *vpm
, uint32_t desc
)
890 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
891 uint8_t *pn
= vpn
, *pm
= vpm
;
892 uint64_t *zda
= vzda
, *zn
= vzn
;
894 for (row
= 0; row
< oprsz
; ++row
) {
895 if (pn
[H1(row
)] & 1) {
896 uint64_t zn_row
= zn
[row
];
897 for (col
= 0; col
< oprsz
; ++col
) {
898 if (pm
[H1(col
)] & 1) {
899 zda
[tile_vslice_index(row
) + col
] += zn_row
;
906 void HELPER(sme_fmopa_s
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
907 void *vpm
, void *vst
, uint32_t desc
)
909 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
910 uint32_t neg
= simd_data(desc
) << 31;
911 uint16_t *pn
= vpn
, *pm
= vpm
;
915 * Make a copy of float_status because this operation does not
916 * update the cumulative fp exception status. It also produces
919 fpst
= *(float_status
*)vst
;
920 set_default_nan_mode(true, &fpst
);
922 for (row
= 0; row
< oprsz
; ) {
923 uint16_t pa
= pn
[H2(row
>> 4)];
926 void *vza_row
= vza
+ tile_vslice_offset(row
);
927 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
)) ^ neg
;
929 for (col
= 0; col
< oprsz
; ) {
930 uint16_t pb
= pm
[H2(col
>> 4)];
933 uint32_t *a
= vza_row
+ H1_4(col
);
934 uint32_t *m
= vzm
+ H1_4(col
);
935 *a
= float32_muladd(n
, *m
, *a
, 0, &fpst
);
948 void HELPER(sme_fmopa_d
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
949 void *vpm
, void *vst
, uint32_t desc
)
951 intptr_t row
, col
, oprsz
= simd_oprsz(desc
) / 8;
952 uint64_t neg
= (uint64_t)simd_data(desc
) << 63;
953 uint64_t *za
= vza
, *zn
= vzn
, *zm
= vzm
;
954 uint8_t *pn
= vpn
, *pm
= vpm
;
955 float_status fpst
= *(float_status
*)vst
;
957 set_default_nan_mode(true, &fpst
);
959 for (row
= 0; row
< oprsz
; ++row
) {
960 if (pn
[H1(row
)] & 1) {
961 uint64_t *za_row
= &za
[tile_vslice_index(row
)];
962 uint64_t n
= zn
[row
] ^ neg
;
964 for (col
= 0; col
< oprsz
; ++col
) {
965 if (pm
[H1(col
)] & 1) {
966 uint64_t *a
= &za_row
[col
];
967 *a
= float64_muladd(n
, zm
[col
], *a
, 0, &fpst
);
/*
 * Alter PAIR as needed for controlling predicates being false,
 * and for NEG on an enabled row element.
 *
 * PAIR holds two 16-bit elements.  Bit 0 of PG governs the low
 * element and bit 2 governs the high element; an element whose
 * predicate bit is clear is forced to zero.  NEG is XORed into PAIR
 * first (callers pass sign-bit masks, or 0 for no negation).
 */
static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    /*
     * The pseudocode uses a conditional negate after the conditional zero.
     * It is simpler here to unconditionally negate before conditional zero.
     */
    pair ^= neg;
    if (!(pg & 1)) {
        pair &= 0xffff0000u;
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;
    }
    return pair;
}
994 static float32
f16_dotadd(float32 sum
, uint32_t e1
, uint32_t e2
,
995 float_status
*s_f16
, float_status
*s_std
,
999 * We need three different float_status for different parts of this
1001 * - the input conversion of the float16 values must use the
1002 * f16-specific float_status, so that the FPCR.FZ16 control is applied
1003 * - operations on float32 including the final accumulation must use
1004 * the normal float_status, so that FPCR.FZ is applied
1005 * - we have pre-set-up copy of s_std which is set to round-to-odd,
1006 * for the multiply (see below)
1008 float64 e1r
= float16_to_float64(e1
& 0xffff, true, s_f16
);
1009 float64 e1c
= float16_to_float64(e1
>> 16, true, s_f16
);
1010 float64 e2r
= float16_to_float64(e2
& 0xffff, true, s_f16
);
1011 float64 e2c
= float16_to_float64(e2
>> 16, true, s_f16
);
1016 * The ARM pseudocode function FPDot performs both multiplies
1017 * and the add with a single rounding operation. Emulate this
1018 * by performing the first multiply in round-to-odd, then doing
1019 * the second multiply as fused multiply-add, and rounding to
1020 * float32 all in one step.
1022 t64
= float64_mul(e1r
, e2r
, s_odd
);
1023 t64
= float64r32_muladd(e1c
, e2c
, t64
, 0, s_std
);
1025 /* This conversion is exact, because we've already rounded. */
1026 t32
= float64_to_float32(t64
, s_std
);
1028 /* The final accumulation step is not fused. */
1029 return float32_add(sum
, t32
, s_std
);
1032 void HELPER(sme_fmopa_h
)(void *vza
, void *vzn
, void *vzm
, void *vpn
,
1033 void *vpm
, CPUARMState
*env
, uint32_t desc
)
1035 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
1036 uint32_t neg
= simd_data(desc
) * 0x80008000u
;
1037 uint16_t *pn
= vpn
, *pm
= vpm
;
1038 float_status fpst_odd
, fpst_std
, fpst_f16
;
1041 * Make copies of fp_status and fp_status_f16, because this operation
1042 * does not update the cumulative fp exception status. It also
1043 * produces default NaNs. We also need a second copy of fp_status with
1044 * round-to-odd -- see above.
1046 fpst_f16
= env
->vfp
.fp_status_f16
;
1047 fpst_std
= env
->vfp
.fp_status
;
1048 set_default_nan_mode(true, &fpst_std
);
1049 set_default_nan_mode(true, &fpst_f16
);
1050 fpst_odd
= fpst_std
;
1051 set_float_rounding_mode(float_round_to_odd
, &fpst_odd
);
1053 for (row
= 0; row
< oprsz
; ) {
1054 uint16_t prow
= pn
[H2(row
>> 4)];
1056 void *vza_row
= vza
+ tile_vslice_offset(row
);
1057 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
));
1059 n
= f16mop_adj_pair(n
, prow
, neg
);
1061 for (col
= 0; col
< oprsz
; ) {
1062 uint16_t pcol
= pm
[H2(col
>> 4)];
1064 if (prow
& pcol
& 0b0101) {
1065 uint32_t *a
= vza_row
+ H1_4(col
);
1066 uint32_t m
= *(uint32_t *)(vzm
+ H1_4(col
));
1068 m
= f16mop_adj_pair(m
, pcol
, 0);
1069 *a
= f16_dotadd(*a
, n
, m
,
1070 &fpst_f16
, &fpst_std
, &fpst_odd
);
1082 void HELPER(sme_bfmopa
)(void *vza
, void *vzn
, void *vzm
,
1083 void *vpn
, void *vpm
, CPUARMState
*env
, uint32_t desc
)
1085 intptr_t row
, col
, oprsz
= simd_maxsz(desc
);
1086 uint32_t neg
= simd_data(desc
) * 0x80008000u
;
1087 uint16_t *pn
= vpn
, *pm
= vpm
;
1088 float_status fpst
, fpst_odd
;
1090 if (is_ebf(env
, &fpst
, &fpst_odd
)) {
1091 for (row
= 0; row
< oprsz
; ) {
1092 uint16_t prow
= pn
[H2(row
>> 4)];
1094 void *vza_row
= vza
+ tile_vslice_offset(row
);
1095 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
));
1097 n
= f16mop_adj_pair(n
, prow
, neg
);
1099 for (col
= 0; col
< oprsz
; ) {
1100 uint16_t pcol
= pm
[H2(col
>> 4)];
1102 if (prow
& pcol
& 0b0101) {
1103 uint32_t *a
= vza_row
+ H1_4(col
);
1104 uint32_t m
= *(uint32_t *)(vzm
+ H1_4(col
));
1106 m
= f16mop_adj_pair(m
, pcol
, 0);
1107 *a
= bfdotadd_ebf(*a
, n
, m
, &fpst
, &fpst_odd
);
1118 for (row
= 0; row
< oprsz
; ) {
1119 uint16_t prow
= pn
[H2(row
>> 4)];
1121 void *vza_row
= vza
+ tile_vslice_offset(row
);
1122 uint32_t n
= *(uint32_t *)(vzn
+ H1_4(row
));
1124 n
= f16mop_adj_pair(n
, prow
, neg
);
1126 for (col
= 0; col
< oprsz
; ) {
1127 uint16_t pcol
= pm
[H2(col
>> 4)];
1129 if (prow
& pcol
& 0b0101) {
1130 uint32_t *a
= vza_row
+ H1_4(col
);
1131 uint32_t m
= *(uint32_t *)(vzm
+ H1_4(col
));
1133 m
= f16mop_adj_pair(m
, pcol
, 0);
1134 *a
= bfdotadd(*a
, n
, m
, &fpst
);
/*
 * Per-element callback for the 32-bit integer outer product:
 * (n, m, accumulator, predicate bits, negate) -> new accumulator.
 */
typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);

/*
 * Integer outer product with 32-bit accumulators.
 *
 * Every (row, col) element of the tile is passed through FN together
 * with zn[row], zm[col] and the AND of the row and column predicate
 * nibbles.  The row nibble is masked to 4 bits; the column byte is
 * only shifted, but the AND discards its upper bits.  FN is called
 * for every element, with the predicate handled inside FN.
 */
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn32 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        /* Two 4-bit predicate groups are packed into each byte. */
        uint8_t pa = (pn[H1(row >> 1)] >> ((row & 1) * 4)) & 0xf;
        uint32_t *za_row = &za[tile_vslice_index(row)];
        uint32_t n = zn[H4(row)];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col >> 1)] >> ((col & 1) * 4);
            uint32_t *a = &za_row[H4(col)];

            *a = fn(n, zm[H4(col)], *a, pa & pb, neg);
        }
    }
}
/*
 * Per-element callback for the 64-bit integer outer product:
 * (n, m, accumulator, predicate bits, negate) -> new accumulator.
 */
typedef uint64_t IMOPFn64(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Integer outer product with 64-bit accumulators.
 *
 * Every (row, col) element of the tile is passed through FN together
 * with zn[row], zm[col] and the AND of the row and column predicate
 * bytes.  FN is called for every element, with the predicate handled
 * inside FN.
 */
static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
                              uint8_t *pn, uint8_t *pm,
                              uint32_t desc, IMOPFn64 *fn)
{
    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
    bool neg = simd_data(desc);

    for (row = 0; row < oprsz; ++row) {
        uint8_t pa = pn[H1(row)];
        uint64_t *za_row = &za[tile_vslice_index(row)];
        uint64_t n = zn[row];

        for (col = 0; col < oprsz; ++col) {
            uint8_t pb = pm[H1(col)];
            uint64_t *a = &za_row[col];

            *a = fn(n, zm[col], *a, pa & pb, neg);
        }
    }
}
/*
 * Define an IMOPFn32: a 4-way dot product of the 8-bit lanes of N and
 * M, added to (or, when NEG, subtracted from) the accumulator A.
 * NTYPE/MTYPE are the 8-bit lane types and so select the signedness
 * of each operand.  P is the predicate nibble for the four lanes.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
    sum += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                               \
    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
    sum += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                             \
    return neg ? a - sum : a + sum;                                         \
}
/*
 * Define an IMOPFn64: a 4-way dot product of the 16-bit lanes of N
 * and M, added to (or, when NEG, subtracted from) the accumulator A.
 * NTYPE/MTYPE are the 16-bit lane types and so select the signedness
 * of each operand.  P is the predicate byte for the four lanes.
 *
 * The N lane is widened to int64_t before multiplying so that the
 * product is computed in 64 bits; a plain 16x16 multiply would be
 * done in 'int' and could overflow into its sign bit.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1217 DEF_IMOP_32(smopa_s
, int8_t, int8_t)
1218 DEF_IMOP_32(umopa_s
, uint8_t, uint8_t)
1219 DEF_IMOP_32(sumopa_s
, int8_t, uint8_t)
1220 DEF_IMOP_32(usmopa_s
, uint8_t, int8_t)
1222 DEF_IMOP_64(smopa_d
, int16_t, int16_t)
1223 DEF_IMOP_64(umopa_d
, uint16_t, uint16_t)
1224 DEF_IMOP_64(sumopa_d
, int16_t, uint16_t)
1225 DEF_IMOP_64(usmopa_d
, uint16_t, int16_t)
1227 #define DEF_IMOPH(NAME, S) \
1228 void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm, \
1229 void *vpn, void *vpm, uint32_t desc) \
1230 { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }
1234 DEF_IMOPH(sumopa
, s
)
1235 DEF_IMOPH(usmopa
, s
)
1239 DEF_IMOPH(sumopa
, d
)
1240 DEF_IMOPH(usmopa
, d
)