Index: test/CodeGen/AMDGPU/amdgcn.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -3,19 +3,15 @@ ; This test just checks that the compiler doesn't crash. -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - ; FUNC-LABEL: {{^}}v32i8_to_v8i32: -; SI: s_endpgm -define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { entry: %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> %3 = extractelement <8 x i32> %2, i32 1 %4 = icmp ne i32 %3, 0 %5 = select i1 %4, float 0.0, float 1.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void + ret float %5 } ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: Index: test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- test/CodeGen/AMDGPU/commute-shifts.ll +++ test/CodeGen/AMDGPU/commute-shifts.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}main: ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1 -define amdgpu_ps void @main(float %arg0, float %arg1) #0 { +define amdgpu_ps float @main(float %arg0, float %arg1) #0 { bb: %tmp = fptosi float %arg0 to i32 %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -17,13 +17,11 @@ %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1 %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7) %tmp9 = bitcast i32 %tmp8 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9) - ret void + ret float %tmp9 } declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare 
i32 @llvm.SI.packf16(float, float) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/default-fp-mode.ll =================================================================== --- test/CodeGen/AMDGPU/default-fp-mode.ll +++ test/CodeGen/AMDGPU/default-fp-mode.ll @@ -97,18 +97,15 @@ ; GCN-LABEL: {{^}}kill_vcc_implicit_def: ; GCN: IeeeMode: 0 -define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { +define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { entry: %tmp0 = fcmp olt float %13, 0.0 call void @llvm.AMDGPU.kill(float %14) %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) - ret void + ret float %tmp1 } - declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { nounwind "target-cpu"="tahiti" } attributes #1 = { nounwind "target-cpu"="fiji" } Index: test/CodeGen/AMDGPU/elf.ll =================================================================== --- test/CodeGen/AMDGPU/elf.ll +++ test/CodeGen/AMDGPU/elf.ll @@ -24,11 +24,13 @@ ; TONGA-NEXT: .long 704 ; CONFIG: .p2align 8 ; CONFIG: test: -define amdgpu_ps void @test(i32 %p) { +define amdgpu_ps void 
@test(i32 %p) #0 { %i = add i32 %p, 2 %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 false) ret void } -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/imm.ll =================================================================== --- test/CodeGen/AMDGPU/imm.ll +++ test/CodeGen/AMDGPU/imm.ll @@ -667,3 +667,18 @@ store double 4096.0, double addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}literal_folding: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}} +define amdgpu_vs void @literal_folding(float %arg) { +main_body: + %tmp = fmul float %arg, 0x3FE86A7F00000000 + %tmp1 = fmul float %arg, 0xBFE86A7F00000000 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp, float %tmp, float %tmp1, float %tmp1, i1 true, i1 false) #0 + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/insert-waits-exp.mir =================================================================== --- test/CodeGen/AMDGPU/insert-waits-exp.mir +++ test/CodeGen/AMDGPU/insert-waits-exp.mir @@ -1,18 +1,18 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s --- | - define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x + i32> inreg, i32 inreg %w, float %v) #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %c = load volatile float, float addrspace(1)* 
undef %d = load volatile float, float addrspace(1)* undef - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d) + call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false) ret <4 x float> } - declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - attributes #0 = { readnone } - attributes #1 = { nounwind } + attributes #0 = { nounwind } ... --- Index: test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -4,15 +4,14 @@ ; SI-LABEL: {{^}}kill_gs_const: ; SI-NOT: v_cmpx_le_f32 ; SI: s_mov_b64 exec, 0 - define amdgpu_gs void @kill_gs_const() { main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) + %tmp = icmp ule i32 0, 3 + %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %tmp1) + %tmp2 = icmp ule i32 3, 0 + %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %tmp3) ret void } @@ -21,16 +20,16 @@ ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { 
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) { entry: - %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) - %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + %tmp0 = fcmp olt float %arg13, 0.000000e+00 + call void @llvm.AMDGPU.kill(float %arg14) + %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } -declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.AMDGPU.kill(float) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -!0 = !{!"const", null, i32 1} +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -1,146 +1,144 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -;CHECK-LABEL: {{^}}image_load_v4i32: 
-;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v4i32: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_v2i32: -;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v2i32: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_i32: -;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) { +; GCN-LABEL: {{^}}image_load_i32: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> 
@llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_mip: +; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret <4 x float> %tex } -;CHECK-LABEL: {{^}}image_load_1: -;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_1: +; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) %elt = extractelement <4 x float> %tex, i32 0 -; Only first component used, test that dmask etc. 
is changed accordingly ret float %elt } -;CHECK-LABEL: {{^}}image_load_f32_v2i32: -;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +; GCN-LABEL: {{^}}image_load_f32_v2i32: +; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { main_body: - %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) ret float %tex } -;CHECK-LABEL: {{^}}image_load_v2f32_v4i32: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm -;CHECK: s_waitcnt vmcnt(0) -define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +; GCN-LABEL: {{^}}image_load_v2f32_v4i32: +; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +; GCN: s_waitcnt vmcnt(0) +define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: - %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) ret <2 x float> %tex } - -;CHECK-LABEL: {{^}}image_store_v4i32: -;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v4i32: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm +define 
amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_v2i32: -;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v2i32: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_i32: -;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) { +; GCN-LABEL: {{^}}image_store_i32: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_f32_i32: -;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm -define amdgpu_ps void 
@image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) { +; GCN-LABEL: {{^}}image_store_f32_i32: +; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_v2f32_v4i32: -;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm -define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_v2f32_v4i32: +; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}image_store_mip: -;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { +; GCN-LABEL: {{^}}image_store_mip: +; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm +define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: - call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + 
call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define amdgpu_ps void @getresinfo() { +; GCN-LABEL: {{^}}getresinfo: +; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf +define amdgpu_ps void @getresinfo() #0 { main_body: - %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0 ret void } ; Ideally, the register allocator would avoid the wait here ; -;CHECK-LABEL: {{^}}image_store_wait: -;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) expcnt(0) -;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm -;CHECK: s_waitcnt vmcnt(0) -;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm -define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) { +; GCN-LABEL: {{^}}image_store_wait: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) expcnt(0) +; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm +define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg 
%arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { main_body: - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) - call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false) + %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } @@ -149,21 +147,22 @@ ; VI-LABEL: image_load_mmo ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) { - store float 0.0, float addrspace(3)* %lds - %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) +define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) #0 { +bb: + store float 0.000000e+00, float addrspace(3)* %lds + %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 - store float 0.0, float addrspace(3)* %tmp2 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex) + store float 0.000000e+00, float addrspace(3)* %tmp2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex, float %tex, float %tex, float %tex, i1 true, 
i1 true) #0 ret void } declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 @@ -173,10 +172,9 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -3,7 +3,6 @@ ; RUN: llc -march=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,16BANK %s ; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s - ; GCN-LABEL: {{^}}v_interp: ; GCN-NOT: s_wqm ; GCN: s_mov_b32 m0, s{{[0-9]+}} @@ -11,17 +10,17 @@ ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} -define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) { +define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 { main_body: - %i = extractelement <2 x float> %4, i32 0 - %j = extractelement <2 x float> %4, i32 1 - %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3) - %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3) - %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3) - %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3) - %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3) + %i = extractelement <2 x float> %arg4, i32 0 + %j = extractelement <2 x float> %arg4, i32 1 + %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %arg3) + %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %arg3) + %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %arg3) + %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %arg3) + %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg3) %w = fadd float %p1_1, %const - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_0, 
float %p1_1, float %w, i1 true, i1 true) #0 ret void } @@ -40,7 +39,8 @@ ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.w{{$}} ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.w{{$}} ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} -define amdgpu_ps void @v_interp_p1(float %i) { +define amdgpu_ps void @v_interp_p1(float %i) #0 { +bb: %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 256) %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 256) %p0_2 = call float @llvm.amdgcn.interp.p1(float %i, i32 2, i32 0, i32 256) @@ -80,7 +80,8 @@ ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.x{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}} -define amdgpu_ps void @v_interp_p2(float %x, float %j) { +define amdgpu_ps void @v_interp_p2(float %x, float %j) #0 { +bb: %p2_0 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 0, i32 256) %p2_1 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 1, i32 0, i32 256) %p2_2 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 2, i32 0, i32 256) @@ -121,7 +122,8 @@ ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p10, attr64.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_3, attr64.y{{$}} ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_10, attr64.x{{$}} -define amdgpu_ps void @v_interp_mov(float %x, float %j) { +define amdgpu_ps void @v_interp_mov(float %x, float %j) #0 { +bb: %mov_0 = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 256) %mov_1 = call float @llvm.amdgcn.interp.mov(i32 1, i32 0, i32 0, i32 256) %mov_2 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 256) @@ -164,12 +166,13 @@ ; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}} ; VI: s_mov_b32 m0, -1{{$}} ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps void @v_interp_readnone(float 
addrspace(3)* %lds) { - store float 0.0, float addrspace(3)* %lds +define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 { +bb: + store float 0.000000e+00, float addrspace(3)* %lds %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0) %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 - store float 0.0, float addrspace(3)* %tmp2 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + store float 0.000000e+00, float addrspace(3)* %tmp2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 ret void } @@ -178,43 +181,44 @@ ; GCN-LABEL: {{^}}v_interp_p1_bank16_bug: ; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] -define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) { +define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 { main_body: %i.i = extractelement <2 x i32> %arg19, i32 0 %j.i = extractelement <2 x i32> %arg19, i32 1 %i.f.i = bitcast i32 %i.i to float %j.f.i = bitcast i32 %j.i to float 
- %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #1 - %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #1 + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #0 %i.i7 = extractelement <2 x i32> %arg19, i32 0 %j.i8 = extractelement <2 x i32> %arg19, i32 1 %i.f.i9 = bitcast i32 %i.i7 to float %j.f.i10 = bitcast i32 %j.i8 to float - %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #1 - %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #1 + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #0 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #0 %i.i1 = extractelement <2 x i32> %arg19, i32 0 %j.i2 = extractelement <2 x i32> %arg19, i32 1 %i.f.i3 = bitcast i32 %i.i1 to float %j.f.i4 = bitcast i32 %j.i2 to float - %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #1 - %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #1 + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #0 %tmp = call float @llvm.fabs.f32(float %p2.i) %tmp34 = call float @llvm.fabs.f32(float %p2.i12) %tmp35 = call float @llvm.fabs.f32(float %p2.i6) %tmp36 = call i32 @llvm.SI.packf16(float %tmp, float %tmp34) - %tmp37 = bitcast i32 %tmp36 to float + %tmp37 = bitcast i32 %tmp36 to <2 x half> %tmp38 = call i32 @llvm.SI.packf16(float %tmp35, float 1.000000e+00) - %tmp39 = bitcast i32 %tmp38 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float 
%tmp39) + %tmp39 = bitcast i32 %tmp38 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 true) #0 ret void } -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 -declare i32 @llvm.SI.packf16(float, float) #0 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare i32 @llvm.SI.packf16(float, float) #1 -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -1,24 +1,22 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mbcnt_intrinsics: ; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] ; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] - -define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* 
inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) { +define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) { main_body: - %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 - %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1 - %4 = bitcast i32 %hi to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4) + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0 + %tmp = bitcast i32 %hi to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp, float %tmp, float %tmp, float %tmp, i1 true, i1 true) #1 ret void } -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 - -declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 +declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/lshl.ll =================================================================== --- test/CodeGen/AMDGPU/lshl.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = mul i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, 
float, float) Index: test/CodeGen/AMDGPU/lshr.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = udiv i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/mulhu.ll =================================================================== --- test/CodeGen/AMDGPU/mulhu.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab -;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} -;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 - -define void @test(i32 %p) { - %i = udiv i32 %p, 3 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/ret.ll =================================================================== --- test/CodeGen/AMDGPU/ret.ll +++ test/CodeGen/AMDGPU/ret.ll @@ -1,25 +1,24 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=GCN %s -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - ; GCN-LABEL: {{^}}vgpr: ; GCN: v_mov_b32_e32 v1, v0 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 -; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm +; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - %x = fadd float %3, 1.0 - %a = insertvalue {float, float} undef, float %x, 0 - %b = insertvalue {float, float} %a, float %3, 1 - ret {float, float} %b +define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + %x = fadd float %arg3, 1.000000e+00 + %a = insertvalue { float, float } undef, float %x, 0 + %b = insertvalue { float, float } %a, float %arg3, 1 + ret { float, float } %b } ; GCN-LABEL: {{^}}vgpr_literal: ; GCN: v_mov_b32_e32 v4, v0 -; GCN: exp mrt0 v4, v4, v4, v4 done compr vm +; GCN: exp mrt0 v4, v4, v4, v4 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 @@ -27,12 +26,12 @@ ; GCN-DAG: v_mov_b32_e32 v3, -1.0 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0} +define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, 
float %arg3, float %arg3, i1 true, i1 true) #0 + ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 } } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -44,24 +43,24 @@ ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v6 ; GCN-NOT: s_endpgm -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = 
insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 1 ; GCN-NEXT: .long 165584 @@ -69,11 +68,11 @@ ; GCN-LABEL: {{^}}ps_input_ena_no_inputs: ; GCN: v_mov_b32_e32 v0, 1.0 ; GCN-NOT: s_endpgm -define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - ret float 1.0 +define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + ret float 1.000000e+00 } - ; GCN: .long 165580 ; GCN-NEXT: .long 2081 ; GCN-NEXT: .long 165584 @@ -83,14 +82,14 @@ ; GCN-DAG: v_mov_b32_e32 v1, v2 ; GCN: v_mov_b32_e32 v2, v3 ; GCN-NOT: s_endpgm -define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - %f = bitcast <2 x i32> %8 to <2 x float> - %s = insertvalue {float, <2 x float>} undef, float %14, 0 - %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1 - ret {float, <2 x float>} %s1 +define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> 
%arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +bb: + %f = bitcast <2 x i32> %arg8 to <2 x float> + %s = insertvalue { float, <2 x float> } undef, float %arg14, 0 + %s1 = insertvalue { float, <2 x float> } %s, <2 x float> %f, 1 + ret { float, <2 x float> } %s1 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -102,25 +101,24 @@ ; GCN-DAG: v_mov_b32_e32 v3, v6 ; GCN-DAG: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -attributes #1 = { "InitialPSInputAddr"="1" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, 
float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -132,25 +130,24 @@ ; GCN: v_mov_b32_e32 v3, v8 ; GCN: v_mov_b32_e32 v4, v12 ; GCN-NOT: s_endpgm -attributes #2 = { "InitialPSInputAddr"="119" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 { +bb: + %i0 = extractelement <2 x i32> %arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 
%i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN: .long 165580 ; GCN-NEXT: .long 562 ; GCN-NEXT: .long 165584 @@ -162,38 +159,37 @@ ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -attributes #3 = { "InitialPSInputAddr"="418" } -define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 { - %i0 = extractelement <2 x i32> %4, i32 0 - %i1 = extractelement <2 x i32> %4, i32 1 - %i2 = extractelement <2 x i32> %7, i32 0 - %i3 = extractelement <2 x i32> %8, i32 0 +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 { +bb: + %i0 = extractelement <2 x i32> 
%arg4, i32 0 + %i1 = extractelement <2 x i32> %arg4, i32 1 + %i2 = extractelement <2 x i32> %arg7, i32 0 + %i3 = extractelement <2 x i32> %arg8, i32 0 %f0 = bitcast i32 %i0 to float %f1 = bitcast i32 %i1 to float %f2 = bitcast i32 %i2 to float %f3 = bitcast i32 %i3 to float - %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0 - %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1 - %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2 - %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3 - %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4 - ret {float, float, float, float, float} %r4 + %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0 + %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1 + %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2 + %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3 + %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4 + ret { float, float, float, float, float } %r4 } - ; GCN-LABEL: {{^}}sgpr: ; GCN: s_add_i32 s0, s3, 2 ; GCN: s_mov_b32 s2, s3 ; GCN-NOT: s_endpgm -define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - %x = add i32 %2, 2 - %a = insertvalue {i32, i32, i32} undef, i32 %x, 0 - %b = insertvalue {i32, i32, i32} %a, i32 %1, 1 - %c = insertvalue {i32, i32, i32} %a, i32 %2, 2 - ret {i32, i32, i32} %c +define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + %x = add i32 %arg2, 2 + %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0 + %b = insertvalue { i32, i32, i32 } %a, i32 %arg1, 1 + %c = insertvalue { i32, i32, i32 } %a, i32 %arg2, 2 + ret { i32, i32, i32 } %c } - ; GCN-LABEL: {{^}}sgpr_literal: ; GCN: s_mov_b32 s0, 5 ; GCN-NOT: s_mov_b32 s0, s0 @@ -201,37 +197,37 @@ ; 
GCN-DAG: s_mov_b32 s2, 7 ; GCN-DAG: s_mov_b32 s3, 8 ; GCN-NOT: s_endpgm -define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - %x = add i32 %2, 2 - ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8} +define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + %x = add i32 %arg2, 2 + ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 } } - ; GCN-LABEL: {{^}}both: ; GCN: v_mov_b32_e32 v1, v0 -; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm +; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 ; GCN-DAG: s_add_i32 s0, s3, 2 ; GCN-DAG: s_mov_b32 s1, s2 ; GCN: s_mov_b32 s2, s3 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - %v = fadd float %3, 1.0 - %s = add i32 %2, 2 - %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0 - %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1 - %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2 - %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3 - %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4 - ret {float, i32, float, i32, i32} %a4 +define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + %v = fadd float %arg3, 1.000000e+00 + %s = add i32 %arg2, 2 + %a0 = insertvalue { float, i32, float, i32, i32 } undef, float %v, 0 + %a1 = insertvalue { float, i32, float, i32, i32 } %a0, i32 %s, 1 + %a2 = insertvalue { float, i32, float, i32, i32 } 
%a1, float %arg3, 2 + %a3 = insertvalue { float, i32, float, i32, i32 } %a2, i32 %arg1, 3 + %a4 = insertvalue { float, i32, float, i32, i32 } %a3, i32 %arg2, 4 + ret { float, i32, float, i32, i32 } %a4 } - ; GCN-LABEL: {{^}}structure_literal: ; GCN: v_mov_b32_e32 v3, v0 -; GCN: exp mrt0 v3, v3, v3, v3 done compr vm +; GCN: exp mrt0 v3, v3, v3, v3 done vm ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 @@ -239,9 +235,16 @@ ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN: s_waitcnt expcnt(0) -define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) - ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}} +define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +bb: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 + ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } } } -attributes #0 = { nounwind "InitialPSInputAddr"="0" } +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "InitialPSInputAddr"="0" } +attributes #2 = { nounwind "InitialPSInputAddr"="1" } +attributes #3 = { nounwind "InitialPSInputAddr"="119" } +attributes #4 = { nounwind "InitialPSInputAddr"="418" } Index: test/CodeGen/AMDGPU/seto.ll =================================================================== --- test/CodeGen/AMDGPU/seto.ll +++ test/CodeGen/AMDGPU/seto.ll @@ -4,12 +4,9 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
[[SREG:s[0-9]+]], [[SREG]] ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { +define amdgpu_ps float @main(float inreg %p) { main_body: %c = fcmp oeq float %p, %p %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void + ret float %r } - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/setuo.ll =================================================================== --- test/CodeGen/AMDGPU/setuo.ll +++ test/CodeGen/AMDGPU/setuo.ll @@ -4,12 +4,9 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { +define amdgpu_ps float @main(float inreg %p) { main_body: %c = fcmp une float %p, %p %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void + ret float %r } - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -1,13 +1,10 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg 
%arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { +define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -25,13 +22,13 @@ ENDIF: ; preds = %ELSE, %main_body %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ] %tmp27 = fadd float %temp.0, %tmp23 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0 ret void } ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x 
i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -58,32 +55,32 @@ %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float %j.f.i = bitcast i32 %j.i to float - %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #0 - %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #0 + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #1 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #1 %i.i19 = extractelement <2 x i32> %arg5, i32 0 %j.i20 = extractelement <2 x i32> %arg5, i32 1 %i.f.i21 = bitcast i32 %i.i19 to float %j.f.i22 = bitcast i32 %j.i20 to float - %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #0 - %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #0 + %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #1 + %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #1 %i.i13 = extractelement <2 x i32> %arg5, i32 0 %j.i14 = extractelement <2 x i32> %arg5, i32 1 %i.f.i15 = bitcast i32 %i.i13 to float %j.f.i16 = bitcast i32 %j.i14 to float - %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #0 - %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #0 + %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #1 + %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #1 %i.i7 = extractelement <2 x i32> %arg5, i32 0 
%j.i8 = extractelement <2 x i32> %arg5, i32 1 %i.f.i9 = bitcast i32 %i.i7 to float %j.f.i10 = bitcast i32 %j.i8 to float - %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #0 - %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #0 + %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #1 + %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #1 %i.i1 = extractelement <2 x i32> %arg5, i32 0 %j.i2 = extractelement <2 x i32> %arg5, i32 1 %i.f.i3 = bitcast i32 %i.i1 to float %j.f.i4 = bitcast i32 %j.i2 to float - %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #0 - %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #0 + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1 %tmp45 = bitcast float %p2.i to i32 %tmp46 = bitcast float %p2.i24 to i32 %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 @@ -168,16 +165,16 @@ %tmp111 = fsub float -0.000000e+00, %tmp105 %tmp112 = fmul float %tmp111, %tmp106 %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110) - %tmp114 = bitcast i32 %tmp113 to float + %tmp114 = bitcast i32 %tmp113 to <2 x half> %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00) - %tmp116 = bitcast i32 %tmp115 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116) + %tmp116 = bitcast i32 %tmp115 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp114, <2 x half> %tmp116, i1 true, i1 true) #0 ret void } ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, 
<16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { +define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -204,7 +201,7 @@ br i1 %tmp33, label %IF, label %ENDIF IF: ; preds = %LOOP - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP @@ -230,7 +227,7 @@ ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #1 { +define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] 
addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -261,7 +258,7 @@ %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ] %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ] %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ] - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %val.0, float %val.1, float %val.2, float 0.000000e+00, i1 true, i1 true) #0 ret void } @@ -294,7 +291,7 @@ ; This test is just checking that we don't crash / assertion fail. 
; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #1 { +define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -308,7 +305,7 @@ IF70: ; preds = %LOOP68 %q = icmp ne i32 %l, 13 %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF69: ; preds = %LOOP68 @@ -330,7 +327,7 @@ ; [[END]]: ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} ; CHECK: s_endpgm -define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 
%arg19, float %arg20, float %arg21) #1 { +define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3 @@ -343,14 +340,14 @@ %j.i = extractelement <2 x i32> %arg7, i32 1 %i.f.i = bitcast i32 %i.i to float %j.f.i = bitcast i32 %j.i to float - %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1 - %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1 + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #0 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #0 %i.i1 = extractelement <2 x i32> %arg7, i32 0 %j.i2 = extractelement <2 x i32> %arg7, i32 1 %i.f.i3 = bitcast i32 %i.i1 to float %j.f.i4 = bitcast i32 %j.i2 to float - %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1 - %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1 + %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #0 + %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #0 %tmp31 = bitcast float %tmp23 to i32 %tmp36 = icmp ne i32 %tmp31, 0 br i1 %tmp36, label %bb38, label %bb80 @@ -377,80 +374,58 @@ bb71: ; preds = %bb80, %bb38 %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ] %tmp88 = extractelement <4 x 
float> %tmp72, i32 0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0 ret void } ; Check the the resource descriptor is stored in an sgpr. ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #1 { +define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: - %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef) + %tmp13 = bitcast i32 %tmp12 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 ret void } ; Check the the sampler is stored in an sgpr. 
; CHECK-LABEL: {{^}}mimg_ssamp_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #1 { +define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 { bb: - %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef) + %tmp13 = bitcast i32 %tmp12 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.fabs.f32(float) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.rsq.f32(float) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.exp2.f32(float) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.pow.f32(float, float) #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) 
#0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } -attributes #2 = { nounwind readonly } +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.rsq.f32(float) #1 +declare float @llvm.exp2.f32(float) #1 +declare float @llvm.pow.f32(float, float) #1 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 + +declare i32 @llvm.SI.packf16(float, float) #1 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 @@ -466,4 +466,12 @@ ret void } +; FUNC-LABEL: {{^}}test_mul2: +; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 +define void @test_mul2(i32 %p) { + %i = mul i32 %p, 2 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} + attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-literal-folding.ll =================================================================== --- test/CodeGen/AMDGPU/si-literal-folding.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}} -define amdgpu_vs void @main(float) { -main_body: - %1 = fmul float %0, 0x3FE86A7F00000000 - %2 = fmul float %0, 0xBFE86A7F00000000 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/si-lod-bias.ll =================================================================== --- test/CodeGen/AMDGPU/si-lod-bias.ll +++ test/CodeGen/AMDGPU/si-lod-bias.ll @@ -1,11 +1,11 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; This shader has the potential to generated illegal VGPR to SGPR copies if ; the wrong register class is 
used for the REG_SEQUENCE instructions. -; CHECK: {{^}}main: -; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf +; GCN-LABEL: {{^}}main: +; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 @@ -40,26 +40,16 @@ %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 %tmp39 = extractelement <4 x float> %tmp35, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp36, float %tmp37, float %tmp38, float %tmp39, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; Function Attrs: nounwind readnone declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 +declare float 
@llvm.SI.load.const(<16 x i8>, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ test/CodeGen/AMDGPU/si-scheduler.ll @@ -3,7 +3,7 @@ ; The only way the subtarget knows that the si machine scheduler is being used ; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend ; won't know what scheduler we are using. -; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s +; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s ; The test checks the "si" machine scheduler pass works correctly. @@ -16,7 +16,7 @@ ; CHECK: s_waitcnt vmcnt(0) ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) { +define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { main_body: %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x 
i8> addrspace(2)* %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0 @@ -46,29 +46,22 @@ %tmp34 = extractelement <4 x float> %tmp31, i32 2 %tmp35 = extractelement <4 x float> %tmp31, i32 3 %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33) - %tmp37 = bitcast i32 %tmp36 to float + %tmp37 = bitcast i32 %tmp36 to <2 x half> %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35) - %tmp39 = bitcast i32 %tmp38 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39) + %tmp39 = bitcast i32 %tmp38 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 false) #0 ret void } -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #0 +declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare i32 @llvm.SI.packf16(float, float) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- 
test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -728,10 +728,10 @@ %tmp579 = fmul float %tmp574, %tmp45 %tmp580 = fadd float %tmp579, %tmp556 %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578) - %tmp582 = bitcast i32 %tmp581 to float + %tmp582 = bitcast i32 %tmp581 to <2 x half> %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282) - %tmp584 = bitcast i32 %tmp583 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584) + %tmp584 = bitcast i32 %tmp583 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp582, <2 x half> %tmp584, i1 true, i1 true) #0 ret void ENDIF66: ; preds = %LOOP65 @@ -1807,10 +1807,10 @@ %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00) %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00) %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770) - %tmp777 = bitcast i32 %tmp776 to float + %tmp777 = bitcast i32 %tmp776 to <2 x half> %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %clamp.i2) - %tmp779 = bitcast i32 %tmp778 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779) + %tmp779 = bitcast i32 %tmp778 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp777, <2 x half> %tmp779, i1 true, i1 true) #0 ret void ELSE214: ; preds = %ELSE211 @@ -1828,11 +1828,11 @@ declare float @llvm.exp2.f32(float) #1 declare float @llvm.ceil.f32(float) #1 -declare float @llvm.amdgcn.rsq.f32(float) #1 declare float @llvm.fabs.f32(float) #1 declare float @llvm.pow.f32(float, float) #1 declare float @llvm.minnum.f32(float, float) #1 declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.rsq.f32(float) #1 declare float @llvm.amdgcn.cubeid(float, float, float) #1 declare float @llvm.amdgcn.cubesc(float, float, 
float) #1 declare float @llvm.amdgcn.cubetc(float, float, float) #1 @@ -1841,13 +1841,14 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 + declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.packf16(float, float) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-spill-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-cf.ll +++ test/CodeGen/AMDGPU/si-spill-cf.ll @@ -6,270 +6,271 @@ ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] ; SI-NOT: v_readlane_b32 [[SAVED]] + define amdgpu_ps void @main() #0 { main_body: - %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %8 = call float @llvm.SI.load.const(<16 x 
i8> undef, i32 112) - %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %39 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 276) - %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) - %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) - %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) + %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) + %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) + 
%tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) + %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) + %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) + %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) + %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) + %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) + %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) + %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) + %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) + %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) + %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) + %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) + %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) + %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) + %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) + %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) + %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) + %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) + %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) + %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) + %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) + %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) + %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) + %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) + %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) + %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) + %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) + %tmp32 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 240) + %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) + %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) + %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) + %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) + %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) + %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) + %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) + %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) + %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) + %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) + %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) + %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) + %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) + %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) + %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) + %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) + %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) + %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) + %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) + %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) + %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) + %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) + %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) + %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) + %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) + %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) + %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) + %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) + %tmp61 = call float @llvm.SI.load.const(<16 x 
i8> undef, i32 392) + %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) + %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) + %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) + %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) + %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) - %67 = icmp sgt i32 %tid, 4 - br i1 %67, label %ENDLOOP, label %ENDIF + %tmp67 = icmp sgt i32 %tid, 4 + br i1 %tmp67, label %ENDLOOP, label %ENDIF ENDLOOP: ; preds = %ELSE2566, %LOOP - %one.sub.a.i = fsub float 1.000000e+00, %0 + %one.sub.a.i = fsub float 1.000000e+00, %tmp %one.sub.ac.i = fmul float %one.sub.a.i, undef %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0 ret void ENDIF: ; preds = %LOOP - %68 = fsub float %2, undef - %69 = fsub float %3, undef - %70 = fsub float %4, undef - %71 = fmul float %68, 0.000000e+00 - %72 = fmul float %69, undef - %73 = fmul float %70, undef - %74 = fsub float %6, undef - %75 = fsub float %7, undef - %76 = fmul float %74, undef - %77 = fmul float %75, 0.000000e+00 - %78 = call float @llvm.minnum.f32(float %73, float %77) - %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00) - %80 = call float @llvm.maxnum.f32(float %72, float %76) - %81 = call float @llvm.maxnum.f32(float undef, float %78) - %82 = call float @llvm.minnum.f32(float %79, float %80) - %83 = call float @llvm.minnum.f32(float %82, float undef) - %84 = fsub float %14, 
undef - %85 = fsub float %15, undef - %86 = fsub float %16, undef - %87 = fmul float %84, undef - %88 = fmul float %85, undef - %89 = fmul float %86, undef - %90 = fsub float %17, undef - %91 = fsub float %18, undef - %92 = fsub float %19, undef - %93 = fmul float %90, 0.000000e+00 - %94 = fmul float %91, undef - %95 = fmul float %92, undef - %96 = call float @llvm.minnum.f32(float %88, float %94) - %97 = call float @llvm.maxnum.f32(float %87, float %93) - %98 = call float @llvm.maxnum.f32(float %89, float %95) - %99 = call float @llvm.maxnum.f32(float undef, float %96) - %100 = call float @llvm.maxnum.f32(float %99, float undef) - %101 = call float @llvm.minnum.f32(float %97, float undef) - %102 = call float @llvm.minnum.f32(float %101, float %98) - %103 = fsub float %30, undef - %104 = fsub float %31, undef - %105 = fmul float %103, 0.000000e+00 - %106 = fmul float %104, 0.000000e+00 - %107 = call float @llvm.minnum.f32(float undef, float %105) - %108 = call float @llvm.maxnum.f32(float undef, float %106) - %109 = call float @llvm.maxnum.f32(float undef, float %107) - %110 = call float @llvm.maxnum.f32(float %109, float undef) - %111 = call float @llvm.minnum.f32(float undef, float %108) - %112 = fsub float %32, undef - %113 = fsub float %33, undef - %114 = fsub float %34, undef - %115 = fmul float %112, 0.000000e+00 - %116 = fmul float %113, undef - %117 = fmul float %114, undef - %118 = fsub float %35, undef - %119 = fsub float %36, undef - %120 = fsub float %37, undef - %121 = fmul float %118, undef - %122 = fmul float %119, undef - %123 = fmul float %120, undef - %124 = call float @llvm.minnum.f32(float %115, float %121) - %125 = call float @llvm.minnum.f32(float %116, float %122) - %126 = call float @llvm.minnum.f32(float %117, float %123) - %127 = call float @llvm.maxnum.f32(float %124, float %125) - %128 = call float @llvm.maxnum.f32(float %127, float %126) - %129 = fsub float %38, undef - %130 = fsub float %39, undef - %131 = fsub float %40, undef - %132 
= fmul float %129, 0.000000e+00 - %133 = fmul float %130, undef - %134 = fmul float %131, undef - %135 = fsub float %41, undef - %136 = fsub float %42, undef - %137 = fsub float %43, undef - %138 = fmul float %135, undef - %139 = fmul float %136, undef - %140 = fmul float %137, undef - %141 = call float @llvm.minnum.f32(float %132, float %138) - %142 = call float @llvm.minnum.f32(float %133, float %139) - %143 = call float @llvm.minnum.f32(float %134, float %140) - %144 = call float @llvm.maxnum.f32(float %141, float %142) - %145 = call float @llvm.maxnum.f32(float %144, float %143) - %146 = fsub float %44, undef - %147 = fsub float %45, undef - %148 = fsub float %46, undef - %149 = fmul float %146, 0.000000e+00 - %150 = fmul float %147, 0.000000e+00 - %151 = fmul float %148, undef - %152 = fsub float %47, undef - %153 = fsub float %48, undef - %154 = fsub float %49, undef - %155 = fmul float %152, undef - %156 = fmul float %153, 0.000000e+00 - %157 = fmul float %154, undef - %158 = call float @llvm.minnum.f32(float %149, float %155) - %159 = call float @llvm.minnum.f32(float %150, float %156) - %160 = call float @llvm.minnum.f32(float %151, float %157) - %161 = call float @llvm.maxnum.f32(float %158, float %159) - %162 = call float @llvm.maxnum.f32(float %161, float %160) - %163 = fsub float %50, undef - %164 = fsub float %51, undef - %165 = fsub float %52, undef - %166 = fmul float %163, undef - %167 = fmul float %164, 0.000000e+00 - %168 = fmul float %165, 0.000000e+00 - %169 = fsub float %53, undef - %170 = fsub float %54, undef - %171 = fsub float %55, undef - %172 = fdiv float 1.000000e+00, %temp18.0 - %173 = fmul float %169, undef - %174 = fmul float %170, undef - %175 = fmul float %171, %172 - %176 = call float @llvm.minnum.f32(float %166, float %173) - %177 = call float @llvm.minnum.f32(float %167, float %174) - %178 = call float @llvm.minnum.f32(float %168, float %175) - %179 = call float @llvm.maxnum.f32(float %176, float %177) - %180 = call float 
@llvm.maxnum.f32(float %179, float %178) - %181 = fsub float %62, undef - %182 = fsub float %63, undef - %183 = fsub float %64, undef - %184 = fmul float %181, 0.000000e+00 - %185 = fmul float %182, undef - %186 = fmul float %183, undef - %187 = fsub float %65, undef - %188 = fsub float %66, undef - %189 = fmul float %187, undef - %190 = fmul float %188, undef - %191 = call float @llvm.maxnum.f32(float %184, float %189) - %192 = call float @llvm.maxnum.f32(float %185, float %190) - %193 = call float @llvm.maxnum.f32(float %186, float undef) - %194 = call float @llvm.minnum.f32(float %191, float %192) - %195 = call float @llvm.minnum.f32(float %194, float %193) - %.temp292.7 = select i1 undef, float %162, float undef - %temp292.9 = select i1 false, float %180, float %.temp292.7 + %tmp68 = fsub float %tmp2, undef + %tmp69 = fsub float %tmp3, undef + %tmp70 = fsub float %tmp4, undef + %tmp71 = fmul float %tmp68, 0.000000e+00 + %tmp72 = fmul float %tmp69, undef + %tmp73 = fmul float %tmp70, undef + %tmp74 = fsub float %tmp6, undef + %tmp75 = fsub float %tmp7, undef + %tmp76 = fmul float %tmp74, undef + %tmp77 = fmul float %tmp75, 0.000000e+00 + %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77) + %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00) + %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76) + %tmp81 = call float @llvm.maxnum.f32(float undef, float %tmp78) + %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80) + %tmp83 = call float @llvm.minnum.f32(float %tmp82, float undef) + %tmp84 = fsub float %tmp14, undef + %tmp85 = fsub float %tmp15, undef + %tmp86 = fsub float %tmp16, undef + %tmp87 = fmul float %tmp84, undef + %tmp88 = fmul float %tmp85, undef + %tmp89 = fmul float %tmp86, undef + %tmp90 = fsub float %tmp17, undef + %tmp91 = fsub float %tmp18, undef + %tmp92 = fsub float %tmp19, undef + %tmp93 = fmul float %tmp90, 0.000000e+00 + %tmp94 = fmul float %tmp91, undef + %tmp95 = fmul float %tmp92, 
undef + %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94) + %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93) + %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95) + %tmp99 = call float @llvm.maxnum.f32(float undef, float %tmp96) + %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float undef) + %tmp101 = call float @llvm.minnum.f32(float %tmp97, float undef) + %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98) + %tmp103 = fsub float %tmp30, undef + %tmp104 = fsub float %tmp31, undef + %tmp105 = fmul float %tmp103, 0.000000e+00 + %tmp106 = fmul float %tmp104, 0.000000e+00 + %tmp107 = call float @llvm.minnum.f32(float undef, float %tmp105) + %tmp108 = call float @llvm.maxnum.f32(float undef, float %tmp106) + %tmp109 = call float @llvm.maxnum.f32(float undef, float %tmp107) + %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float undef) + %tmp111 = call float @llvm.minnum.f32(float undef, float %tmp108) + %tmp112 = fsub float %tmp32, undef + %tmp113 = fsub float %tmp33, undef + %tmp114 = fsub float %tmp34, undef + %tmp115 = fmul float %tmp112, 0.000000e+00 + %tmp116 = fmul float %tmp113, undef + %tmp117 = fmul float %tmp114, undef + %tmp118 = fsub float %tmp35, undef + %tmp119 = fsub float %tmp36, undef + %tmp120 = fsub float %tmp37, undef + %tmp121 = fmul float %tmp118, undef + %tmp122 = fmul float %tmp119, undef + %tmp123 = fmul float %tmp120, undef + %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121) + %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122) + %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123) + %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125) + %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126) + %tmp129 = fsub float %tmp38, undef + %tmp130 = fsub float %tmp39, undef + %tmp131 = fsub float %tmp40, undef + %tmp132 = fmul float %tmp129, 0.000000e+00 + %tmp133 = fmul float %tmp130, undef + %tmp134 = fmul 
float %tmp131, undef + %tmp135 = fsub float %tmp41, undef + %tmp136 = fsub float %tmp42, undef + %tmp137 = fsub float %tmp43, undef + %tmp138 = fmul float %tmp135, undef + %tmp139 = fmul float %tmp136, undef + %tmp140 = fmul float %tmp137, undef + %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138) + %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139) + %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140) + %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142) + %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143) + %tmp146 = fsub float %tmp44, undef + %tmp147 = fsub float %tmp45, undef + %tmp148 = fsub float %tmp46, undef + %tmp149 = fmul float %tmp146, 0.000000e+00 + %tmp150 = fmul float %tmp147, 0.000000e+00 + %tmp151 = fmul float %tmp148, undef + %tmp152 = fsub float %tmp47, undef + %tmp153 = fsub float %tmp48, undef + %tmp154 = fsub float %tmp49, undef + %tmp155 = fmul float %tmp152, undef + %tmp156 = fmul float %tmp153, 0.000000e+00 + %tmp157 = fmul float %tmp154, undef + %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155) + %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156) + %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157) + %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159) + %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160) + %tmp163 = fsub float %tmp50, undef + %tmp164 = fsub float %tmp51, undef + %tmp165 = fsub float %tmp52, undef + %tmp166 = fmul float %tmp163, undef + %tmp167 = fmul float %tmp164, 0.000000e+00 + %tmp168 = fmul float %tmp165, 0.000000e+00 + %tmp169 = fsub float %tmp53, undef + %tmp170 = fsub float %tmp54, undef + %tmp171 = fsub float %tmp55, undef + %tmp172 = fdiv float 1.000000e+00, %temp18.0 + %tmp173 = fmul float %tmp169, undef + %tmp174 = fmul float %tmp170, undef + %tmp175 = fmul float %tmp171, %tmp172 + %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173) 
+ %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174) + %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175) + %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float %tmp177) + %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178) + %tmp181 = fsub float %tmp62, undef + %tmp182 = fsub float %tmp63, undef + %tmp183 = fsub float %tmp64, undef + %tmp184 = fmul float %tmp181, 0.000000e+00 + %tmp185 = fmul float %tmp182, undef + %tmp186 = fmul float %tmp183, undef + %tmp187 = fsub float %tmp65, undef + %tmp188 = fsub float %tmp66, undef + %tmp189 = fmul float %tmp187, undef + %tmp190 = fmul float %tmp188, undef + %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189) + %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190) + %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float undef) + %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192) + %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193) + %.temp292.7 = select i1 undef, float %tmp162, float undef + %temp292.9 = select i1 false, float %tmp180, float %.temp292.7 %.temp292.9 = select i1 undef, float undef, float %temp292.9 - %196 = fcmp ogt float undef, 0.000000e+00 - %197 = fcmp olt float undef, %195 - %198 = and i1 %196, %197 - %199 = fcmp olt float undef, %.temp292.9 - %200 = and i1 %198, %199 - %temp292.11 = select i1 %200, float undef, float %.temp292.9 - %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tmp196 = fcmp ogt float undef, 0.000000e+00 + %tmp197 = fcmp olt float undef, %tmp195 + %tmp198 = and i1 %tmp196, %tmp197 + %tmp199 = fcmp olt float undef, %.temp292.9 + %tmp200 = and i1 %tmp198, %tmp199 + %temp292.11 = select i1 %tmp200, float undef, float %.temp292.9 + %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp0 = icmp eq i32 %tid0, 0 br i1 %cmp0, label %IF2565, label %ELSE2566 IF2565: ; preds = %ENDIF - %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tid1 = call i32 
@llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp1 = icmp eq i32 %tid1, 0 br i1 %cmp1, label %ENDIF2582, label %ELSE2584 ELSE2566: ; preds = %ENDIF - %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tidf = bitcast i32 %tid2 to float - %201 = fcmp oeq float %temp292.11, %tidf - br i1 %201, label %ENDLOOP, label %ELSE2593 + %tmp201 = fcmp oeq float %temp292.11, %tidf + br i1 %tmp201, label %ENDLOOP, label %ELSE2593 ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] - %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ] - %202 = fsub float %5, undef - %203 = fmul float %202, undef - %204 = call float @llvm.maxnum.f32(float undef, float %203) - %205 = call float @llvm.minnum.f32(float %204, float undef) - %206 = call float @llvm.minnum.f32(float %205, float undef) - %207 = fcmp ogt float undef, 0.000000e+00 - %208 = fcmp olt float undef, 1.000000e+00 - %209 = and i1 %207, %208 - %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ undef, %ENDIF2594 ] + %tmp202 = fsub float %tmp5, undef + %tmp203 = fmul float %tmp202, undef + %tmp204 = call float @llvm.maxnum.f32(float undef, float %tmp203) + %tmp205 = call float @llvm.minnum.f32(float %tmp204, float undef) + %tmp206 = call float @llvm.minnum.f32(float %tmp205, float undef) + %tmp207 = fcmp ogt float undef, 0.000000e+00 + %tmp208 = fcmp olt float undef, 1.000000e+00 + %tmp209 = and i1 %tmp207, %tmp208 + %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tidf3 = bitcast i32 %tid3 to float - %210 = fcmp olt float %tidf3, %206 - %211 = and i1 %209, %210 - br i1 %211, label %ENDIF2795, label %ELSE2797 + %tmp210 = fcmp olt float %tidf3, %tmp206 + %tmp211 = and i1 %tmp209, %tmp210 + br i1 %tmp211, label %ENDIF2795, label %ELSE2797 ELSE2584: ; preds = %IF2565 br label %ENDIF2582 ENDIF2582: ; preds = %ELSE2584, %IF2565 - %212 = fadd 
float %1, undef - %213 = fadd float 0.000000e+00, %212 - %floor = call float @llvm.floor.f32(float %213) - %214 = fsub float %213, %floor - %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2 + %tmp212 = fadd float %tmp1, undef + %tmp213 = fadd float 0.000000e+00, %tmp212 + %floor = call float @llvm.floor.f32(float %tmp213) + %tmp214 = fsub float %tmp213, %floor + %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %cmp4 = icmp eq i32 %tid4, 0 br i1 %cmp4, label %IF2589, label %ELSE2590 @@ -280,61 +281,61 @@ br label %ENDIF2588 ENDIF2588: ; preds = %ELSE2590, %IF2589 - %215 = fsub float 1.000000e+00, %214 - %216 = call float @llvm.sqrt.f32(float %215) - %217 = fmul float %216, undef - %218 = fadd float %217, undef + %tmp215 = fsub float 1.000000e+00, %tmp214 + %tmp216 = call float @llvm.sqrt.f32(float %tmp215) + %tmp217 = fmul float %tmp216, undef + %tmp218 = fadd float %tmp217, undef br label %ENDIF2564 ELSE2593: ; preds = %ELSE2566 - %219 = fcmp oeq float %temp292.11, %81 - %220 = fcmp olt float %81, %83 - %221 = and i1 %219, %220 - br i1 %221, label %ENDIF2594, label %ELSE2596 + %tmp219 = fcmp oeq float %temp292.11, %tmp81 + %tmp220 = fcmp olt float %tmp81, %tmp83 + %tmp221 = and i1 %tmp219, %tmp220 + br i1 %tmp221, label %ENDIF2594, label %ELSE2596 ELSE2596: ; preds = %ELSE2593 - %222 = fcmp oeq float %temp292.11, %100 - %223 = fcmp olt float %100, %102 - %224 = and i1 %222, %223 - br i1 %224, label %ENDIF2594, label %ELSE2632 + %tmp222 = fcmp oeq float %temp292.11, %tmp100 + %tmp223 = fcmp olt float %tmp100, %tmp102 + %tmp224 = and i1 %tmp222, %tmp223 + br i1 %tmp224, label %ENDIF2594, label %ELSE2632 ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 
0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] - %225 = fmul float %temp894.2, undef + %tmp225 = fmul float %temp894.2, undef br label %ENDIF2564 ELSE2632: ; preds = %ELSE2596 br i1 undef, label %ENDIF2594, label %ELSE2650 ELSE2650: ; preds = %ELSE2632 - %226 = fcmp oeq float %temp292.11, %110 - %227 = fcmp olt float %110, %111 - %228 = and i1 %226, %227 - br i1 %228, label %IF2667, label %ELSE2668 + %tmp226 = fcmp oeq float %temp292.11, %tmp110 + %tmp227 = fcmp olt float %tmp110, %tmp111 + %tmp228 = and i1 %tmp226, %tmp227 + br i1 %tmp228, label %IF2667, label %ELSE2668 IF2667: ; preds = %ELSE2650 br i1 undef, label %ENDIF2594, label %ELSE2671 ELSE2668: ; preds = %ELSE2650 - %229 = fcmp oeq float %temp292.11, %128 - %230 = fcmp olt float %128, undef - %231 = and i1 %229, %230 - br i1 %231, label %ENDIF2594, label %ELSE2686 + %tmp229 = fcmp oeq float %temp292.11, %tmp128 + %tmp230 = fcmp olt float %tmp128, undef + %tmp231 = and i1 %tmp229, %tmp230 + br i1 %tmp231, label %ENDIF2594, label %ELSE2686 ELSE2671: ; preds = %IF2667 br label %ENDIF2594 ELSE2686: ; preds = %ELSE2668 - %232 = fcmp oeq float %temp292.11, %145 - %233 = fcmp olt float %145, undef - %234 = and i1 %232, %233 - br i1 %234, label %ENDIF2594, label %ELSE2704 + %tmp232 = fcmp oeq float %temp292.11, %tmp145 + %tmp233 = fcmp olt float %tmp145, undef + %tmp234 = and i1 %tmp232, %tmp233 + br i1 %tmp234, label %ENDIF2594, label %ELSE2704 ELSE2704: ; preds = %ELSE2686 - %235 = fcmp oeq float %temp292.11, %180 - %236 = fcmp olt float %180, undef - %237 = and i1 %235, %236 - br i1 %237, label %ENDIF2594, label %ELSE2740 + %tmp235 = fcmp oeq float %temp292.11, %tmp180 + %tmp236 = fcmp olt float %tmp180, undef + %tmp237 = and i1 %tmp235, %tmp236 + br i1 
%tmp237, label %ENDIF2594, label %ELSE2740 ELSE2740: ; preds = %ELSE2704 br i1 undef, label %IF2757, label %ELSE2758 @@ -349,8 +350,8 @@ br label %ENDIF2594 IF2775: ; preds = %ELSE2758 - %238 = fcmp olt float undef, undef - br i1 %238, label %ENDIF2594, label %ELSE2779 + %tmp238 = fcmp olt float undef, undef + br i1 %tmp238, label %ENDIF2594, label %ELSE2779 ELSE2779: ; preds = %IF2775 br i1 undef, label %ENDIF2594, label %ELSE2782 @@ -359,39 +360,39 @@ br i1 undef, label %ENDIF2594, label %ELSE2785 ELSE2785: ; preds = %ELSE2782 - %239 = fcmp olt float undef, 0.000000e+00 - br i1 %239, label %ENDIF2594, label %ELSE2788 + %tmp239 = fcmp olt float undef, 0.000000e+00 + br i1 %tmp239, label %ENDIF2594, label %ELSE2788 ELSE2788: ; preds = %ELSE2785 - %240 = fcmp olt float 0.000000e+00, undef - %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00 + %tmp240 = fcmp olt float 0.000000e+00, undef + %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00 br label %ENDIF2594 ELSE2797: ; preds = %ENDIF2564 - %241 = fsub float %8, undef - %242 = fsub float %9, undef - %243 = fsub float %10, undef - %244 = fmul float %241, undef - %245 = fmul float %242, undef - %246 = fmul float %243, undef - %247 = fsub float %11, undef - %248 = fsub float %12, undef - %249 = fsub float %13, undef - %250 = fmul float %247, undef - %251 = fmul float %248, undef - %252 = fmul float %249, undef - %253 = call float @llvm.minnum.f32(float %244, float %250) - %254 = call float @llvm.minnum.f32(float %245, float %251) - %255 = call float @llvm.maxnum.f32(float %246, float %252) - %256 = call float @llvm.maxnum.f32(float %253, float %254) - %257 = call float @llvm.maxnum.f32(float %256, float undef) - %258 = call float @llvm.minnum.f32(float undef, float %255) - %259 = fcmp ogt float %257, 0.000000e+00 - %260 = fcmp olt float %257, 1.000000e+00 - %261 = and i1 %259, %260 - %262 = fcmp olt float %257, %258 - %263 = and i1 %261, %262 - br i1 %263, label %ENDIF2795, label 
%ELSE2800 + %tmp241 = fsub float %tmp8, undef + %tmp242 = fsub float %tmp9, undef + %tmp243 = fsub float %tmp10, undef + %tmp244 = fmul float %tmp241, undef + %tmp245 = fmul float %tmp242, undef + %tmp246 = fmul float %tmp243, undef + %tmp247 = fsub float %tmp11, undef + %tmp248 = fsub float %tmp12, undef + %tmp249 = fsub float %tmp13, undef + %tmp250 = fmul float %tmp247, undef + %tmp251 = fmul float %tmp248, undef + %tmp252 = fmul float %tmp249, undef + %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250) + %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251) + %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252) + %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254) + %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float undef) + %tmp258 = call float @llvm.minnum.f32(float undef, float %tmp255) + %tmp259 = fcmp ogt float %tmp257, 0.000000e+00 + %tmp260 = fcmp olt float %tmp257, 1.000000e+00 + %tmp261 = and i1 %tmp259, %tmp260 + %tmp262 = fcmp olt float %tmp257, %tmp258 + %tmp263 = and i1 %tmp261, %tmp262 + br i1 %tmp263, label %ENDIF2795, label %ELSE2800 ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 br label %LOOP @@ -400,53 +401,53 @@ br i1 undef, label %ENDIF2795, label %ELSE2803 ELSE2803: ; preds = %ELSE2800 - %264 = fsub float %20, undef - %265 = fsub float %21, undef - %266 = fsub float %22, undef - %267 = fmul float %264, undef - %268 = fmul float %265, undef - %269 = fmul float %266, 0.000000e+00 - %270 = fsub float %23, undef - %271 = fsub float %24, undef - %272 = fsub float %25, undef - %273 = fmul float %270, undef - %274 = fmul float %271, undef - %275 = fmul float %272, undef - %276 = call float @llvm.minnum.f32(float %267, float %273) - %277 = call float @llvm.maxnum.f32(float %268, float %274) - %278 = call float @llvm.maxnum.f32(float %269, float %275) - %279 = call float 
@llvm.maxnum.f32(float %276, float undef) - %280 = call float @llvm.maxnum.f32(float %279, float undef) - %281 = call float @llvm.minnum.f32(float undef, float %277) - %282 = call float @llvm.minnum.f32(float %281, float %278) - %283 = fcmp ogt float %280, 0.000000e+00 - %284 = fcmp olt float %280, 1.000000e+00 - %285 = and i1 %283, %284 - %286 = fcmp olt float %280, %282 - %287 = and i1 %285, %286 - br i1 %287, label %ENDIF2795, label %ELSE2806 + %tmp264 = fsub float %tmp20, undef + %tmp265 = fsub float %tmp21, undef + %tmp266 = fsub float %tmp22, undef + %tmp267 = fmul float %tmp264, undef + %tmp268 = fmul float %tmp265, undef + %tmp269 = fmul float %tmp266, 0.000000e+00 + %tmp270 = fsub float %tmp23, undef + %tmp271 = fsub float %tmp24, undef + %tmp272 = fsub float %tmp25, undef + %tmp273 = fmul float %tmp270, undef + %tmp274 = fmul float %tmp271, undef + %tmp275 = fmul float %tmp272, undef + %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273) + %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274) + %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275) + %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float undef) + %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float undef) + %tmp281 = call float @llvm.minnum.f32(float undef, float %tmp277) + %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278) + %tmp283 = fcmp ogt float %tmp280, 0.000000e+00 + %tmp284 = fcmp olt float %tmp280, 1.000000e+00 + %tmp285 = and i1 %tmp283, %tmp284 + %tmp286 = fcmp olt float %tmp280, %tmp282 + %tmp287 = and i1 %tmp285, %tmp286 + br i1 %tmp287, label %ENDIF2795, label %ELSE2806 ELSE2806: ; preds = %ELSE2803 - %288 = fsub float %26, undef - %289 = fsub float %27, undef - %290 = fsub float %28, undef - %291 = fmul float %288, undef - %292 = fmul float %289, 0.000000e+00 - %293 = fmul float %290, undef - %294 = fsub float %29, undef - %295 = fmul float %294, undef - %296 = call float @llvm.minnum.f32(float %291, float 
%295) - %297 = call float @llvm.minnum.f32(float %292, float undef) - %298 = call float @llvm.maxnum.f32(float %293, float undef) - %299 = call float @llvm.maxnum.f32(float %296, float %297) - %300 = call float @llvm.maxnum.f32(float %299, float undef) - %301 = call float @llvm.minnum.f32(float undef, float %298) - %302 = fcmp ogt float %300, 0.000000e+00 - %303 = fcmp olt float %300, 1.000000e+00 - %304 = and i1 %302, %303 - %305 = fcmp olt float %300, %301 - %306 = and i1 %304, %305 - br i1 %306, label %ENDIF2795, label %ELSE2809 + %tmp288 = fsub float %tmp26, undef + %tmp289 = fsub float %tmp27, undef + %tmp290 = fsub float %tmp28, undef + %tmp291 = fmul float %tmp288, undef + %tmp292 = fmul float %tmp289, 0.000000e+00 + %tmp293 = fmul float %tmp290, undef + %tmp294 = fsub float %tmp29, undef + %tmp295 = fmul float %tmp294, undef + %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295) + %tmp297 = call float @llvm.minnum.f32(float %tmp292, float undef) + %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float undef) + %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297) + %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float undef) + %tmp301 = call float @llvm.minnum.f32(float undef, float %tmp298) + %tmp302 = fcmp ogt float %tmp300, 0.000000e+00 + %tmp303 = fcmp olt float %tmp300, 1.000000e+00 + %tmp304 = and i1 %tmp302, %tmp303 + %tmp305 = fcmp olt float %tmp300, %tmp301 + %tmp306 = and i1 %tmp304, %tmp305 + br i1 %tmp306, label %ENDIF2795, label %ELSE2809 ELSE2809: ; preds = %ELSE2806 br i1 undef, label %ENDIF2795, label %ELSE2812 @@ -461,53 +462,42 @@ br i1 undef, label %ENDIF2795, label %ELSE2821 ELSE2821: ; preds = %ELSE2818 - %307 = fsub float %56, undef - %308 = fsub float %57, undef - %309 = fsub float %58, undef - %310 = fmul float %307, undef - %311 = fmul float %308, 0.000000e+00 - %312 = fmul float %309, undef - %313 = fsub float %59, undef - %314 = fsub float %60, undef - %315 = fsub float %61, undef - %316 = 
fmul float %313, undef - %317 = fmul float %314, undef - %318 = fmul float %315, undef - %319 = call float @llvm.maxnum.f32(float %310, float %316) - %320 = call float @llvm.maxnum.f32(float %311, float %317) - %321 = call float @llvm.maxnum.f32(float %312, float %318) - %322 = call float @llvm.minnum.f32(float %319, float %320) - %323 = call float @llvm.minnum.f32(float %322, float %321) - %324 = fcmp ogt float undef, 0.000000e+00 - %325 = fcmp olt float undef, 1.000000e+00 - %326 = and i1 %324, %325 - %327 = fcmp olt float undef, %323 - %328 = and i1 %326, %327 - br i1 %328, label %ENDIF2795, label %ELSE2824 + %tmp307 = fsub float %tmp56, undef + %tmp308 = fsub float %tmp57, undef + %tmp309 = fsub float %tmp58, undef + %tmp310 = fmul float %tmp307, undef + %tmp311 = fmul float %tmp308, 0.000000e+00 + %tmp312 = fmul float %tmp309, undef + %tmp313 = fsub float %tmp59, undef + %tmp314 = fsub float %tmp60, undef + %tmp315 = fsub float %tmp61, undef + %tmp316 = fmul float %tmp313, undef + %tmp317 = fmul float %tmp314, undef + %tmp318 = fmul float %tmp315, undef + %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316) + %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317) + %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318) + %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320) + %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321) + %tmp324 = fcmp ogt float undef, 0.000000e+00 + %tmp325 = fcmp olt float undef, 1.000000e+00 + %tmp326 = and i1 %tmp324, %tmp325 + %tmp327 = fcmp olt float undef, %tmp323 + %tmp328 = and i1 %tmp326, %tmp327 + br i1 %tmp328, label %ENDIF2795, label %ELSE2824 ELSE2824: ; preds = %ELSE2821 %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 br label %ENDIF2795 } -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone declare float 
@llvm.floor.f32(float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.sqrt.f32(float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.maxnum.f32(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -1,16 +1,16 @@ -; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s -; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s +; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s ; SMRD load with an immediate offset. 
; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -18,11 +18,11 @@ ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -33,11 +33,11 @@ ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -48,11 +48,11 @@ ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* 
%out, i32 addrspace(2)* %ptr) { +define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -62,11 +62,11 @@ ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -76,11 +76,11 @@ ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out + %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 + %tmp1 = load i32, i32 addrspace(2)* %tmp + store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -88,12 +88,12 @@ ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> 
addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -102,14 +102,15 @@ ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> 
%arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } + ; SMRD load using the load.const intrinsic with an offset greater than the ; largets possible immediate. ; immediate offset. @@ -118,12 +119,12 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call 
float @llvm.SI.load.const(<16 x i8> %21, i32 1024) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -133,12 +134,12 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, 
float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -148,18 +149,17 @@ ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { +define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 + %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind 
readnone } Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -107,7 +107,7 @@ %export = phi float [ %lds_data, %if ], [ %interp, %else ] %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export) %tmp5 = bitcast i32 %tmp4 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp5, float %tmp5, float %tmp5, float %tmp5, i1 true, i1 true) #0 ret void } @@ -205,11 +205,9 @@ ret void } -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 - -declare i32 @llvm.SI.packf16(float, float) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare i32 @llvm.SI.packf16(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/split-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/split-smrd.ll +++ test/CodeGen/AMDGPU/split-smrd.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Move this to sgpr-copy.ll when this is fixed on VI. ; Make sure that when we split an smrd instruction in order to move it to ; the VALU, we are also moving its users to the VALU. 
-; CHECK-LABEL: {{^}}split_smrd_add_worklist: -; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 +; GCN-LABEL: {{^}}split_smrd_add_worklist: +; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) @@ -24,24 +24,20 @@ %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef) - %tmp13 = bitcast i32 %tmp12 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef) + %tmp13 = bitcast i32 %tmp12 to <2 x half> + call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0 ret void } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - declare i32 @llvm.SI.packf16(float, float) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", !3} -!2 = !{!1, !1, i64 0} -!3 = !{!"tbaa root"} +!1 = !{!"const", !2} +!2 = !{!"tbaa root"} Index: test/CodeGen/AMDGPU/subreg-coalescer-crash.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++
test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -1,39 +1,37 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s +; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -; SI-LABEL:{{^}}row_filter_C1_D0: -; SI: s_endpgm -; Function Attrs: nounwind +; GCN-LABEL:{{^}}row_filter_C1_D0: define void @row_filter_C1_D0() { entry: br i1 undef, label %for.inc.1, label %do.body.preheader do.body.preheader: ; preds = %entry - %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 + %tmp = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 br i1 undef, label %do.body56.1, label %do.body90 do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader - %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] - %2 = insertelement <4 x i32> %1, i32 undef, i32 2 - %3 = insertelement <4 x i32> %2, i32 undef, i32 3 + %tmp1 = phi <4 x i32> [ %tmp6, %do.body56.2 ], [ %tmp5, %do.body56.1 ], [ %tmp, %do.body.preheader ] + %tmp2 = insertelement <4 x i32> %tmp1, i32 undef, i32 2 + %tmp3 = insertelement <4 x i32> %tmp2, i32 undef, i32 3 br i1 undef, label %do.body124.1, label %do.body.1562.preheader do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 - %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] - %4 = insertelement <4 x i32> undef, i32 undef, i32 1 + %storemerge = phi <4 x i32> [ %tmp3, %do.body90 ], [ %tmp7, %do.body124.1 ] + %tmp4 = insertelement <4 x i32> undef, i32 undef, i32 1 br label %for.inc.1 do.body56.1: ; preds = %do.body.preheader - %5 = insertelement <4 x i32> %0, i32 undef, i32 1 + %tmp5 = insertelement <4 x i32> %tmp, i32 undef, i32 1 %or.cond472.1 = or i1 undef, undef br i1 %or.cond472.1, label %do.body56.2, label %do.body90 do.body56.2: ; preds = %do.body56.1 - %6 = 
insertelement <4 x i32> %5, i32 undef, i32 1 + %tmp6 = insertelement <4 x i32> %tmp5, i32 undef, i32 1 br label %do.body90 do.body124.1: ; preds = %do.body90 - %7 = insertelement <4 x i32> %3, i32 undef, i32 3 + %tmp7 = insertelement <4 x i32> %tmp3, i32 undef, i32 3 br label %do.body.1562.preheader for.inc.1: ; preds = %do.body.1562.preheader, %entry @@ -42,8 +40,8 @@ unreachable } -; SI-LABEL: {{^}}foo: -; SI: s_endpgm +; GCN-LABEL: {{^}}foo: +; GCN: s_endpgm define amdgpu_ps void @foo() #0 { bb: br i1 undef, label %bb2, label %bb1 @@ -78,9 +76,9 @@ bb14: ; preds = %bb27, %bb24, %bb9 %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] - %tmp17 = fmul float 10.5, %tmp16 - %tmp18 = fmul float 11.5, %tmp15 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) + %tmp17 = fmul float 1.050000e+01, %tmp16 + %tmp18 = fmul float 1.150000e+01, %tmp15 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp18, float %tmp17, float %tmp17, float %tmp17, i1 true, i1 true) #0 ret void bb23: ; preds = %bb13 @@ -97,13 +95,8 @@ br label %bb14 } -; Function Attrs: nounwind readnone +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/udiv.ll =================================================================== --- test/CodeGen/AMDGPU/udiv.ll +++ test/CodeGen/AMDGPU/udiv.ll @@ -5,17 +5,19 @@ ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT ; EG: CF_END + +; SI: v_rcp_iflag_f32_e32 define void @udiv_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1) * %in - %b = load i32, i32 addrspace(1) * %b_ptr + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_udiv_i32: - +; SI: v_rcp_iflag_f32_e32 define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out @@ -30,6 +32,8 @@ ; FUNC-LABEL: {{^}}udiv_v2i32: ; EG: CF_END +; SI: v_rcp_iflag_f32_e32 +; SI: v_rcp_iflag_f32_e32 ; SI: s_endpgm define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -158,3 +162,21 @@ store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; FUNC-LABEL: {{^}}test_udiv2: +; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 +define void @test_udiv2(i32 %p) { + %i = udiv i32 %p, 2 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} + +; FUNC-LABEL: {{^}}test_udiv_3_mulhu: +; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab +; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +define void @test_udiv_3_mulhu(i32 %p) { + %i = udiv i32 %p, 3 + store volatile i32 %i, i32 addrspace(1)* undef + ret void +} Index: test/CodeGen/AMDGPU/urecip.ll =================================================================== --- test/CodeGen/AMDGPU/urecip.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK: v_rcp_iflag_f32_e32 - -define void @test(i32 %p, i32 %q) { - %i = udiv i32 %p, %q - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare 
void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -179,39 +179,39 @@ br i1 %tmp155, label %bb156, label %bb157 bb156: ; preds = %bb24 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37) - call void 
@llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float %tmp30, float %tmp29, float %tmp27, float %tmp26) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140) - call void @llvm.SI.export(i32 15, 
i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float %tmp146, float %tmp147, float %tmp148) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp12, float %tmp103, float %tmp102, float %tmp101, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %tmp99, float %tmp98, float %tmp97, float %tmp95, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float %tmp94, float %tmp93, float %tmp91, float %tmp90, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 35, i32 15, float %tmp89, float %tmp87, float %tmp86, float %tmp85, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 36, i32 15, float %tmp83, float %tmp82, float %tmp81, float %tmp79, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 37, i32 15, float %tmp78, float %tmp77, float %tmp75, float %tmp74, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 38, i32 15, float %tmp73, float %tmp71, float %tmp70, float %tmp69, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 39, i32 15, float %tmp67, float %tmp66, float %tmp65, float %tmp63, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 40, i32 15, float %tmp62, float %tmp61, float %tmp59, float %tmp58, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 41, i32 15, float %tmp57, float %tmp55, float %tmp54, float %tmp53, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 42, i32 15, float %tmp51, float %tmp50, float %tmp49, float %tmp47, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 43, i32 15, float %tmp46, float %tmp45, float %tmp43, float %tmp42, i1 false, i1 false) #0 + 
call void @llvm.amdgcn.exp.f32(i32 44, i32 15, float %tmp41, float %tmp39, float %tmp38, float %tmp37, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 45, i32 15, float %tmp35, float %tmp34, float %tmp33, float %tmp31, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 46, i32 15, float %tmp30, float %tmp29, float %tmp27, float %tmp26, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 47, i32 15, float %tmp25, float %tmp28, float %tmp32, float %tmp36, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 48, i32 15, float %tmp40, float %tmp44, float %tmp48, float %tmp52, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 49, i32 15, float %tmp56, float %tmp60, float %tmp64, float %tmp68, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 50, i32 15, float %tmp72, float %tmp76, float %tmp80, float %tmp84, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 51, i32 15, float %tmp88, float %tmp92, float %tmp96, float %tmp100, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 52, i32 15, float %tmp104, float %tmp105, float %tmp106, float %tmp108, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 53, i32 15, float %tmp109, float %tmp110, float %tmp111, float %tmp112, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 54, i32 15, float %tmp113, float %tmp114, float %tmp115, float %tmp116, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 55, i32 15, float %tmp117, float %tmp118, float %tmp119, float %tmp120, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 56, i32 15, float %tmp121, float %tmp122, float %tmp123, float %tmp124, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 57, i32 15, float %tmp125, float %tmp126, float %tmp127, float %tmp128, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 58, i32 15, float %tmp129, float %tmp130, float %tmp131, float %tmp132, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 59, i32 15, 
float %tmp133, float %tmp134, float %tmp135, float %tmp136, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 60, i32 15, float %tmp137, float %tmp138, float %tmp139, float %tmp140, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 61, i32 15, float %tmp141, float %tmp142, float %tmp143, float %tmp144, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 62, i32 15, float %tmp145, float %tmp146, float %tmp147, float %tmp148, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float %tmp149, float %tmp150, float %tmp151, float %tmp13, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 true, i1 false) #0 ret void bb157: ; preds = %bb24 @@ -482,16 +482,12 @@ br label %bb24 } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 - attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/wait.ll =================================================================== --- test/CodeGen/AMDGPU/wait.ll +++ test/CodeGen/AMDGPU/wait.ll @@ -11,7 +11,7 @@ ; DEFAULT: exp ; DEFAULT: s_waitcnt lgkmcnt(0) ; DEFAULT: s_endpgm -define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) { +define amdgpu_vs void @main(<16 x i8> 
addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -20,8 +20,7 @@ %tmp13 = extractelement <4 x float> %tmp11, i32 1 call void @llvm.amdgcn.s.barrier() #1 %tmp14 = extractelement <4 x float> %tmp11, i32 2 -; %tmp15 = extractelement <4 x float> %tmp11, i32 3 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp15 = load float, float addrspace(2)* %constptr, align 4 %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) @@ -29,8 +28,8 @@ %tmp20 = extractelement <4 x float> %tmp18, i32 1 %tmp21 = extractelement <4 x float> %tmp18, i32 2 %tmp22 = extractelement <4 x float> %tmp18, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 false, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp12, float %tmp13, float %tmp14, float %tmp15, i1 true, i1 false) #0 ret void } @@ -44,40 +43,34 @@ ; ILPMAX: s_waitcnt vmcnt(1) ; ILPMAX: s_waitcnt vmcnt(0) ; ILPMAX: s_endpgm - -define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] 
addrspace(2)* -byval, i32 inreg, i32 inreg, i32, i32, i32, i32) { +define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { main_body: - %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0 - %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0 - %13 = add i32 %5, %7 - %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13) - %15 = extractelement <4 x float> %14, i32 0 - %16 = extractelement <4 x float> %14, i32 1 - %17 = extractelement <4 x float> %14, i32 2 - %18 = extractelement <4 x float> %14, i32 3 - %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1 - %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0 - %21 = add i32 %5, %7 - %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21) - %23 = extractelement <4 x float> %22, i32 0 - %24 = extractelement <4 x float> %22, i32 1 - %25 = extractelement <4 x float> %22, i32 2 - %26 = extractelement <4 x float> %22, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18) - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26) + %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 + %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = add i32 %arg5, %arg7 + %tmp13 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp11, i32 0, i32 %tmp12) + %tmp14 = extractelement <4 x float> %tmp13, i32 0 + %tmp15 = extractelement <4 x float> %tmp13, i32 1 + %tmp16 = extractelement <4 x float> %tmp13, i32 2 + %tmp17 = extractelement <4 x 
float> %tmp13, i32 3 + %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1 + %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0 + %tmp20 = add i32 %arg5, %arg7 + %tmp21 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp19, i32 0, i32 %tmp20) + %tmp22 = extractelement <4 x float> %tmp21, i32 0 + %tmp23 = extractelement <4 x float> %tmp21, i32 1 + %tmp24 = extractelement <4 x float> %tmp21, i32 2 + %tmp25 = extractelement <4 x float> %tmp21, i32 3 + call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp14, float %tmp15, float %tmp16, float %tmp17, i1 true, i1 false) #0 + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp22, float %tmp23, float %tmp24, float %tmp25, i1 false, i1 false) #0 ret void } - -; Function Attrs: convergent nounwind declare void @llvm.amdgcn.s.barrier() #1 - -; Function Attrs: nounwind readnone declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - +attributes #0 = { nounwind } attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s ; Check that WQM isn't triggered by image load/store intrinsics. 
; @@ -25,9 +25,7 @@ %c.3 = extractelement <4 x i32> %c.2, i32 0 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 %data = load float, float addrspace(1)* %gep - - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef) - + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1 ret void } @@ -500,7 +498,7 @@ ret <4 x float> %r } - +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 @@ -512,8 +510,7 @@ declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +declare void @llvm.AMDGPU.kill(float) #1 attributes #1 = { nounwind } attributes #2 = { nounwind readonly } Index: test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll =================================================================== --- test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll +++ test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll @@ -6,46 +6,51 @@ target triple = "amdgcn--" -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2 - -define amdgpu_vs void @wrapper(i32 inreg, i32) { +define amdgpu_vs void @wrapper(i32 
inreg %arg, i32 %arg1) { main_body: - %2 = add i32 %1, %0 - %3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %2) - %4 = extractelement <4 x float> %3, i32 1 - %5 = fptosi float %4 to i32 - %6 = insertelement <2 x i32> undef, i32 %5, i32 1 + %tmp = add i32 %arg1, %arg + %tmp2 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %tmp) + %tmp3 = extractelement <4 x float> %tmp2, i32 1 + %tmp4 = fptosi float %tmp3 to i32 + %tmp5 = insertelement <2 x i32> undef, i32 %tmp4, i32 1 br label %loop11.i loop11.i: ; preds = %endif46.i, %main_body - %7 = phi i32 [ 0, %main_body ], [ %15, %endif46.i ] - %8 = icmp sgt i32 %7, 999 - br i1 %8, label %main.exit, label %if16.i + %tmp6 = phi i32 [ 0, %main_body ], [ %tmp14, %endif46.i ] + %tmp7 = icmp sgt i32 %tmp6, 999 + br i1 %tmp7, label %main.exit, label %if16.i if16.i: ; preds = %loop11.i - %9 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %6, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false) - %10 = extractelement <4 x float> %9, i32 0 - %11 = fcmp ult float 0.000000e+00, %10 - br i1 %11, label %if28.i, label %endif46.i + %tmp8 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp5, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false) + %tmp9 = extractelement <4 x float> %tmp8, i32 0 + %tmp10 = fcmp ult float 0.000000e+00, %tmp9 + br i1 %tmp10, label %if28.i, label %endif46.i if28.i: ; preds = %if16.i - %12 = bitcast float %10 to i32 - %13 = shl i32 %12, 16 - %14 = bitcast i32 %13 to float + %tmp11 = bitcast float %tmp9 to i32 + %tmp12 = shl i32 %tmp11, 16 + %tmp13 = bitcast i32 %tmp12 to float br label %main.exit endif46.i: ; preds = %if16.i - %15 = add i32 %7, 1 + %tmp14 = add i32 %tmp6, 1 br label %loop11.i main.exit: ; preds = %if28.i, %loop11.i - %16 = phi float [ %14, %if28.i ], [ 0x36F0800000000000, %loop11.i ] - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %16, float 0.000000e+00, float 
0.000000e+00, float 0x36A0000000000000) + %tmp15 = phi float [ %tmp13, %if28.i ], [ 0x36F0800000000000, %loop11.i ] + call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp15, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000, i1 false, i1 false) #0 ret void } -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind readonly } -attributes #2 = { nounwind } +; Function Attrs: nounwind +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly }