// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned long ulong;
typedef unsigned int uint;
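
// Note: the {{.*}} patterns in the CHECK lines below deliberately tolerate
// differences in call attributes and calling conventions between the amdgcn
// and spirv64-amd-amdhsa runs.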

// CHECK-LABEL: @test_div_fixup_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.div.fixup.f16
void test_div_fixup_f16(global half* out, half a, half b, half c)
{
  *out = __builtin_amdgcn_div_fixuph(a, b, c);
}

// CHECK-LABEL: @test_rcp_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rcp.f16
void test_rcp_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_rcph(a);
}

// CHECK-LABEL: @test_sqrt_f16
// CHECK: {{.*}}call{{.*}} half @llvm.{{((amdgcn.){0,1})}}sqrt.f16
void test_sqrt_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_sqrth(a);
}

// CHECK-LABEL: @test_rsq_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rsq.f16
void test_rsq_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_rsqh(a);
}

// CHECK-LABEL: @test_sin_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.sin.f16
void test_sin_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_sinh(a);
}

// CHECK-LABEL: @test_cos_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.cos.f16
void test_cos_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_cosh(a);
}
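
// The builtin takes a 32-bit int exponent, but the f16 intrinsic expects i16,
// so codegen truncates the argument first (matched by [[TRUNC]] below).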

// CHECK-LABEL: @test_ldexp_f16
// CHECK: [[TRUNC:%[0-9a-z]+]] = trunc i32
// CHECK: {{.*}}call{{.*}} half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
void test_ldexp_f16(global half* out, half a, int b)
{
  *out = __builtin_amdgcn_ldexph(a, b);
}

// CHECK-LABEL: @test_frexp_mant_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.frexp.mant.f16
void test_frexp_mant_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_frexp_manth(a);
}

// CHECK-LABEL: @test_frexp_exp_f16
// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.frexp.exp.i16.f16
void test_frexp_exp_f16(global short* out, half a)
{
  *out = __builtin_amdgcn_frexp_exph(a);
}

// CHECK-LABEL: @test_fract_f16
// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.fract.f16
void test_fract_f16(global half* out, half a)
{
  *out = __builtin_amdgcn_fracth(a);
}

// CHECK-LABEL: @test_class_f16
// CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f16
void test_class_f16(global half* out, half a, int b)
{
  *out = __builtin_amdgcn_classh(a, b);
}

// CHECK-LABEL: @test_s_memrealtime
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memrealtime()
void test_s_memrealtime(global ulong* out)
{
  *out = __builtin_amdgcn_s_memrealtime();
}

// CHECK-LABEL: @test_s_dcache_wb()
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.dcache.wb()
void test_s_dcache_wb()
{
  __builtin_amdgcn_s_dcache_wb();
}
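
// DPP tests. __builtin_amdgcn_mov_dpp takes (src, dpp_ctrl, row_mask,
// bank_mask, bound_ctrl) and lowers to llvm.amdgcn.update.dpp with a poison
// bound value, as the CHECK lines below show. The 0x101 control selects
// row_shl:1 in the DPP encoding, with 0xf masks enabling all rows and banks.
// Sub-dword types are zero-extended to i32 and truncated back; float and
// double are bitcast to i32 and i64.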

// CHECK-LABEL: @test_mov_dpp_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
void test_mov_dpp_int(global int* out, int src)
{
  *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
}

// CHECK-LABEL: @test_mov_dpp_long
// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %x, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i64 %0,
void test_mov_dpp_long(long x, global long *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_mov_dpp_float
// CHECK: %0 = bitcast float %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i32 %1,
void test_mov_dpp_float(float x, global float *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_mov_dpp_double
// CHECK: %0 = bitcast double %x to i64
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i64 %1,
void test_mov_dpp_double(double x, global double *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_mov_dpp_short
// CHECK: %0 = zext i16 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %2 = trunc i32 %1 to i16
// CHECK-NEXT: store i16 %2,
void test_mov_dpp_short(short x, global short *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_mov_dpp_char
// CHECK: %0 = zext i8 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %2 = trunc i32 %1 to i8
// CHECK-NEXT: store i8 %2,
void test_mov_dpp_char(char x, global char *p) {
  *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_mov_dpp_half
// CHECK: %0 = load i16,
// CHECK: %1 = zext i16 %0 to i32
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %3 = trunc i32 %2 to i16
// CHECK-NEXT: store i16 %3,
void test_mov_dpp_half(half *x, global half *p) {
  *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
}
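
// __builtin_amdgcn_update_dpp generalizes mov_dpp: it takes an explicit old
// value as its first argument, where mov_dpp leaves the bound value poison.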

// CHECK-LABEL: @test_update_dpp_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
void test_update_dpp_int(global int* out, int arg1, int arg2)
{
  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
}

// CHECK-LABEL: @test_update_dpp_long
// CHECK: %0 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %x, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i64 %0,
void test_update_dpp_long(long x, global long *p) {
  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_float
// CHECK: %0 = bitcast float %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i32 %1,
void test_update_dpp_float(float x, global float *p) {
  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_double
// CHECK: %0 = bitcast double %x to i64
// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: store i64 %1,
void test_update_dpp_double(double x, global double *p) {
  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_short
// CHECK: %0 = zext i16 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %2 = trunc i32 %1 to i16
// CHECK-NEXT: store i16 %2,
void test_update_dpp_short(short x, global short *p) {
  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_char
// CHECK: %0 = zext i8 %x to i32
// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %2 = trunc i32 %1 to i8
// CHECK-NEXT: store i8 %2,
void test_update_dpp_char(char x, global char *p) {
  *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_half
// CHECK: %0 = load i16,
// CHECK: %1 = zext i16 %0 to i32
// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
// CHECK-NEXT: %3 = trunc i32 %2 to i16
// CHECK-NEXT: store i16 %3,
void test_update_dpp_half(half *x, global half *p) {
  *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
}

// CHECK-LABEL: @test_update_dpp_int_uint
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
void test_update_dpp_int_uint(global int* out, int arg1, unsigned int arg2)
{
  *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
}

// CHECK-LABEL: @test_update_dpp_lit_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
void test_update_dpp_lit_int(global int* out, int arg1)
{
  *out = __builtin_amdgcn_update_dpp(5, arg1, 0, 0, 0, false);
}
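
// A __constant global folds to the same immediate operand as a literal, as
// the CHECK for test_update_dpp_const_int below expects.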

__constant int gi = 5;

// CHECK-LABEL: @test_update_dpp_const_int
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 5, i32 %arg1, i32 0, i32 0, i32 0, i1 false)
void test_update_dpp_const_int(global int* out, int arg1)
{
  *out = __builtin_amdgcn_update_dpp(gi, arg1, 0, 0, 0, false);
}
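
// LDS float atomic tests. The ds_f*f builtins take (ptr, value, memory order,
// memory scope, volatile) and lower to atomicrmw. __ATOMIC_CONSUME is treated
// as acquire, and the lines marked "invalid" exercise out-of-range order and
// scope values, which fall back to seq_cst and the default (system) scope
// respectively, as the trailing CHECK lines in each group show.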

// CHECK-LABEL: @test_ds_fadd
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_faddf(local float *out, float src) {
#else
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
#endif
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, true);

  // Test all orders.
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST + 1, __MEMORY_SCOPE_SYSTEM, false); // invalid

  // Test all syncscopes.
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
  *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}
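
// test_ds_fminf and test_ds_fmaxf below repeat the same order, scope, and
// volatile combinations for the fmin and fmax atomicrmw operations.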

// CHECK-LABEL: @test_ds_fmin
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

#if !defined(__SPIRV__)
void test_ds_fminf(local float *out, float src) {
#else
void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
#endif
  *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true);

  // Test all orders.
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST + 1, __MEMORY_SCOPE_SYSTEM, false); // invalid

  // Test all syncscopes.
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}

// CHECK-LABEL: @test_ds_fmax
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

#if !defined(__SPIRV__)
void test_ds_fmaxf(local float *out, float src) {
#else
void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
#endif
  *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true);

  // Test all orders.
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST + 1, __MEMORY_SCOPE_SYSTEM, false); // invalid

  // Test all syncscopes.
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}

// CHECK-LABEL: @test_s_memtime
// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
void test_s_memtime(global ulong* out)
{
  *out = __builtin_amdgcn_s_memtime();
}
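
// __builtin_amdgcn_perm maps to the v_perm_b32 byte permute: it selects bytes
// from the 64-bit value formed from a and b according to the selector s.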

// CHECK-LABEL: @test_perm
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
void test_perm(global uint* out, uint a, uint b, uint s)
{
  *out = __builtin_amdgcn_perm(a, b, s);
}

// CHECK-LABEL: @test_groupstaticsize
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.groupstaticsize()
void test_groupstaticsize(global uint* out)
{
  *out = __builtin_amdgcn_groupstaticsize();
}