Index: test/CodeGen/AMDGPU/coalescer-subrange-crash.ll =================================================================== --- test/CodeGen/AMDGPU/coalescer-subrange-crash.ll +++ test/CodeGen/AMDGPU/coalescer-subrange-crash.ll @@ -36,7 +36,8 @@ %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6 %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7 %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8 - %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp33.bc = bitcast <16 x i32> %tmp33 to <16 x float> + %tmp34 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp33.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp35 = extractelement <4 x float> %tmp34, i32 0 %tmp36 = bitcast float %tmp24 to i32 %tmp37 = insertelement <16 x i32> , i32 %tmp36, i32 1 @@ -47,7 +48,8 @@ %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6 %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7 %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8 - %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp44.bc = bitcast <16 x i32> %tmp44 to <16 x float> + %tmp45 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp44.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp46 = extractelement <4 x float> %tmp45, i32 0 %tmp47 = fmul float %tmp35, %tmp46 %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14 @@ -55,20 +57,10 @@ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, 
float, float, float, float, float, float, float, float, float, float, float }> %tmp49 } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } Index: test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- test/CodeGen/AMDGPU/commute-shifts.ll +++ test/CodeGen/AMDGPU/commute-shifts.ll @@ -7,7 +7,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { bb: %tmp = fptosi float %arg0 to i32 - %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp1 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) %tmp2.f = extractelement <4 x float> %tmp1, i32 0 %tmp2 = bitcast float %tmp2.f to i32 %tmp3 = and i32 %tmp, 7 @@ -20,8 +20,9 @@ ret float %tmp9 } -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <2 x half> 
@llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } Index: test/CodeGen/AMDGPU/else.ll =================================================================== --- test/CodeGen/AMDGPU/else.ll +++ test/CodeGen/AMDGPU/else.ll @@ -1,12 +1,12 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}else_no_execfix: ; CHECK: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], ; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps float @else_no_execfix(i32 %z, float %v) { +define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -33,7 +33,7 @@ ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]] ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) { +define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -44,8 +44,7 @@ else: %c = fmul float %v, 3.0 - %c.i = bitcast float %c to i32 - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %v.else = extractelement <4 x float> %tex, i32 0 br label %end @@ -55,6 +54,9 @@ 
ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone +attributes #0 = { nounwind } +attributes #1 = { nounwind writeonly } +attributes #2 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec @@ -7,7 +7,7 @@ ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However, ; the expectation is that the intrinsic will be used in non-trivial shaders, ; so such an optimization doesn't seem worth the effort. 
-define amdgpu_ps float @test1() { +define amdgpu_ps float @test1() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 %r = bitcast i32 %live.32 to float @@ -19,12 +19,11 @@ ; CHECK-DAG: s_wqm_b64 exec, exec ; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] ; CHECK: image_sample v0, [[VAR]], -define amdgpu_ps float @test2() { +define amdgpu_ps float @test2() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %live.32.bc = bitcast i32 %live.32 to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %r = extractelement <4 x float> %t, i32 0 ret float %r } @@ -35,7 +34,7 @@ ; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]] ; CHECK: ; %dead -define amdgpu_ps float @test3(i32 %in) { +define amdgpu_ps float @test3(i32 %in) #0 { entry: %live = call i1 @llvm.amdgcn.ps.live() br i1 %live, label %end, label %dead @@ -46,14 +45,15 @@ end: %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ] - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tc.bc = bitcast i32 %tc to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %r = extractelement <4 x float> %t, i32 0 ret float %r } -declare i1 @llvm.amdgcn.ps.live() #0 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i1 @llvm.amdgcn.ps.live() #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -86,8 +86,9 @@ %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> - %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp50 = extractelement <4 x float> %tmp49, i32 2 + %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp50 = extractelement <4 x float> %tmp1, i32 2 %tmp51 = call float @llvm.fabs.f32(float %tmp50) %tmp52 = fmul float %p2.i18, %p2.i18 %tmp53 = fmul float %p2.i12, %p2.i12 @@ -239,17 +240,17 @@ br i1 %tmp27, label %if, label %else if: ; preds = %entry - %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.if.0 = extractelement <4 x float> %tmp1, i32 0 + %val.if.1 = extractelement <4 x float> %tmp1, i32 1 + %val.if.2 = extractelement <4 x float> 
%tmp1, i32 2 br label %endif else: ; preds = %entry - %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.else.0 = extractelement <4 x float> %tmp2, i32 0 + %val.else.1 = extractelement <4 x float> %tmp2, i32 1 + %val.else.2 = extractelement <4 x float> %tmp2, i32 2 br label %endif endif: ; preds = %else, %if @@ -356,7 +357,8 @@ %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32> - %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) br label %bb71 bb80: ; preds = %bb @@ -366,11 +368,12 @@ %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32> - %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 
false, i1 false) br label %bb71 bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ] + %tmp72 = phi <4 x float> [ %tmp2, %bb38 ], [ %tmp3, %bb80 ] %tmp88 = extractelement <4 x float> %tmp72, i32 0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0 ret void @@ -384,8 +387,8 @@ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void @@ -399,8 +402,8 @@ %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) 
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void @@ -416,12 +419,12 @@ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/si-lod-bias.ll =================================================================== --- test/CodeGen/AMDGPU/si-lod-bias.ll +++ test/CodeGen/AMDGPU/si-lod-bias.ll @@ -35,7 +35,8 @@ %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> - %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float> + %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 @@ -47,12 +48,12 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, 
float, i1, i1) #0 - +declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ test/CodeGen/AMDGPU/si-scheduler.ll @@ -40,7 +40,9 @@ %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1 %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32> %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32> - %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float> + %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp32 = extractelement <4 x float> %tmp31, i32 0 %tmp33 = extractelement <4 x float> %tmp31, i32 1 %tmp34 = extractelement <4 x float> %tmp31, i32 2 @@ -54,12 +56,12 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 attributes #0 = { 
nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -361,7 +361,8 @@ %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5 %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6 %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7 - %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float> + %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp244 = extractelement <4 x float> %tmp243, i32 3 %tmp245 = fcmp oge float %temp30.0, %tmp244 %tmp246 = sext i1 %tmp245 to i32 @@ -407,7 +408,8 @@ %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> - %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> + %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp278 = extractelement <4 x float> %tmp277, i32 0 %tmp279 = extractelement <4 x float> %tmp277, i32 1 %tmp280 = extractelement <4 x float> %tmp277, i32 2 @@ -428,7 +430,8 @@ %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 
7 %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> - %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> + %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp298 = extractelement <4 x float> %tmp297, i32 0 %tmp299 = extractelement <4 x float> %tmp297, i32 1 %tmp300 = extractelement <4 x float> %tmp297, i32 2 @@ -447,7 +450,8 @@ %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> - %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> + %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp316 = extractelement <4 x float> %tmp315, i32 0 %tmp317 = extractelement <4 x float> %tmp315, i32 1 %tmp318 = extractelement <4 x float> %tmp315, i32 2 @@ -477,7 +481,8 @@ %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5 %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6 %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7 - %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float> + %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 
false, i1 false, i1 false, i1 false) %tmp346 = extractelement <4 x float> %tmp345, i32 0 %tmp347 = extractelement <4 x float> %tmp345, i32 1 %tmp348 = extractelement <4 x float> %tmp345, i32 2 @@ -508,7 +513,8 @@ %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> - %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> + %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp376 = extractelement <4 x float> %tmp375, i32 0 %tmp377 = extractelement <4 x float> %tmp375, i32 1 %tmp378 = extractelement <4 x float> %tmp375, i32 2 @@ -563,7 +569,8 @@ %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> - %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> + %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp430 = extractelement <4 x float> %tmp429, i32 0 %tmp431 = extractelement <4 x float> %tmp429, i32 1 %tmp432 = extractelement <4 x float> %tmp429, i32 2 @@ -615,7 +622,8 @@ %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> 
%tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -717,7 +725,8 @@ %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> - %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> + %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp572 = extractelement <4 x float> %tmp571, i32 0 %tmp573 = extractelement <4 x float> %tmp571, i32 1 %tmp574 = extractelement <4 x float> %tmp571, i32 2 @@ -739,7 +748,8 @@ %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5 %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6 %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7 - %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float> + %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp592 = extractelement <4 x float> %tmp591, i32 3 %tmp593 = fcmp oge float 
%temp30.1, %tmp592 %tmp594 = sext i1 %tmp593 to i32 @@ -762,7 +772,7 @@ br label %LOOP65 } -define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -1129,7 +1139,8 @@ %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> - %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> + %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp226 = extractelement <4 x float> %tmp225, i32 0 %tmp227 = extractelement <4 x float> %tmp225, i32 1 %tmp228 = extractelement <4 x float> %tmp225, i32 2 @@ -1202,7 +1213,8 @@ %tmp280 = insertelement <4 x i32> %tmp279, i32 0, 
i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> - %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> + %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp283 = extractelement <4 x float> %tmp282, i32 3 %tmp284 = fadd float %temp168.0, %tmp273 %tmp285 = fadd float %temp169.0, %tmp274 @@ -1266,11 +1278,12 @@ %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> - %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp343 = extractelement <4 x float> %tmp342, i32 0 - %tmp344 = extractelement <4 x float> %tmp342, i32 1 - %tmp345 = extractelement <4 x float> %tmp342, i32 2 - %tmp346 = extractelement <4 x float> %tmp342, i32 3 + %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> + %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp343 = extractelement <4 x float> %tmp0, i32 0 + %tmp344 = extractelement <4 x float> %tmp0, i32 1 + %tmp345 = extractelement <4 x float> %tmp0, i32 2 + %tmp346 = extractelement <4 x float> %tmp0, i32 3 %tmp347 = fmul float %tmp343, %tmp22 %tmp348 = fmul float %tmp344, %tmp23 %tmp349 = fmul float %tmp345, %tmp24 @@ -1299,8 +1312,9 @@ %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 %tmp152.bc = bitcast 
<16 x i8> %tmp152 to <4 x i32> - %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp363 = extractelement <4 x float> %tmp362, i32 2 + %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp363 = extractelement <4 x float> %tmp1, i32 2 %tmp364 = fmul float %result.i40, %result.i %tmp365 = fmul float %result.i36, %result.i44 %tmp366 = fmul float %result.i32, %result.i42 @@ -1310,11 +1324,12 @@ %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1 %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> - %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp373 = extractelement <4 x float> %tmp372, i32 0 - %tmp374 = extractelement <4 x float> %tmp372, i32 1 - %tmp375 = extractelement <4 x float> %tmp372, i32 2 - %tmp376 = extractelement <4 x float> %tmp372, i32 3 + %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp373 = extractelement <4 x float> %tmp2, i32 0 + %tmp374 = extractelement <4 x float> %tmp2, i32 1 + %tmp375 = extractelement <4 x float> %tmp2, i32 2 + %tmp376 = extractelement <4 x float> %tmp2, i32 3 %tmp377 = fcmp olt float 0.000000e+00, %tmp375 %tmp378 = sext i1 %tmp377 to i32 %tmp379 = bitcast i32 %tmp378 to float @@ -1327,11 +1342,12 @@ %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 
%tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> - %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp387 = extractelement <4 x float> %tmp386, i32 0 - %tmp388 = extractelement <4 x float> %tmp386, i32 1 - %tmp389 = extractelement <4 x float> %tmp386, i32 2 - %tmp390 = extractelement <4 x float> %tmp386, i32 3 + %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp387 = extractelement <4 x float> %tmp3, i32 0 + %tmp388 = extractelement <4 x float> %tmp3, i32 1 + %tmp389 = extractelement <4 x float> %tmp3, i32 2 + %tmp390 = extractelement <4 x float> %tmp3, i32 3 %tmp391 = fcmp olt float 0.000000e+00, %tmp389 %tmp392 = sext i1 %tmp391 to i32 %tmp393 = bitcast i32 %tmp392 to float @@ -1425,7 +1441,8 @@ %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -1443,7 +1460,8 @@ %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> - %tmp487 
= call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> + %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp488 = extractelement <4 x float> %tmp487, i32 0 %tmp489 = extractelement <4 x float> %tmp487, i32 1 %tmp490 = extractelement <4 x float> %tmp487, i32 2 @@ -1651,7 +1669,8 @@ %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> - %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> + %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp661 = extractelement <4 x float> %tmp660, i32 0 %tmp662 = extractelement <4 x float> %tmp660, i32 1 %tmp663 = bitcast float %tmp646 to i32 @@ -1661,7 +1680,8 @@ %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1 %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2 %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3 - %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float> + %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp671 = 
extractelement <4 x float> %tmp670, i32 0 %tmp672 = extractelement <4 x float> %tmp670, i32 1 %tmp673 = fsub float -0.000000e+00, %tmp662 @@ -1839,15 +1859,15 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -357,7 +357,7 @@ ; CHECK: [[END]]: ; CHECK: s_or_b64 exec, exec ; CHECK: s_endpgm -define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 { +define amdgpu_ps void 
@if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 { bb: %tmp = fcmp ult float %arg1, 0.000000e+00 br i1 %tmp, label %bb3, label %bb4 @@ -367,7 +367,7 @@ br label %bb4 bb4: ; preds = %bb3, %bb - %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp6 = extractelement <4 x float> %tmp5, i32 0 %tmp7 = fcmp une float %tmp6, 0.000000e+00 br i1 %tmp7, label %bb8, label %bb9 @@ -380,9 +380,8 @@ ret void } +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 declare void @llvm.AMDGPU.kill(float) #0 -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/split-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/split-smrd.ll +++ test/CodeGen/AMDGPU/split-smrd.ll @@ -21,21 +21,21 @@ %tmp6 = sext i32 %tmp5 to i64 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6 %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> , <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp9, i32 0 
%tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } -declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} Index: test/CodeGen/AMDGPU/subreg-coalescer-crash.ll =================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL:{{^}}row_filter_C1_D0: -define void @row_filter_C1_D0() { +define void @row_filter_C1_D0() #0 { entry: br i1 undef, label %for.inc.1, label %do.body.preheader @@ -65,7 +65,7 @@ br label %bb4 bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp11 = extractelement <4 x float> %tmp10, i32 1 %tmp12 = extractelement <4 x float> %tmp10, i32 3 br label %bb14 @@ -95,8 +95,9 @@ br label %bb14 } + declare void @llvm.amdgcn.exp.f32(i32, i32, 
float, float, float, float, i1, i1) #0 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/undefined-subreg-liverange.ll =================================================================== --- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -35,7 +35,8 @@ %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4 %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0 %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1 - %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float> + %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp5 = extractelement <4 x float> %tmp4, i32 0 %tmp6 = fmul float %tmp5, undef %tmp7 = fadd float %tmp6, %tmp6 @@ -83,8 +84,7 @@ ret void } -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/unigine-liveness-crash.ll =================================================================== --- 
test/CodeGen/AMDGPU/unigine-liveness-crash.ll +++ test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -17,7 +17,8 @@ %j.f.i = bitcast i32 %j.i to float %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2 %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2 - %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp24 = extractelement <4 x float> %tmp23, i32 3 %tmp25 = fmul float %tmp24, undef %tmp26 = fmul float undef, %p2.i @@ -26,14 +27,15 @@ %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0 %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1 %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2 - %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float> + %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp33 = extractelement <4 x float> %tmp32, i32 0 %tmp34 = fadd float undef, %tmp33 %tmp35 = fadd float %tmp34, undef %tmp36 = fadd float %tmp35, undef %tmp37 = fadd float %tmp36, undef %tmp38 = fadd float %tmp37, undef - %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp40 = extractelement <4 x 
float> %tmp39, i32 0 %tmp41 = extractelement <4 x float> %tmp39, i32 1 %tmp42 = extractelement <4 x float> %tmp39, i32 2 @@ -50,7 +52,8 @@ %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0 %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1 %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2 - %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float> + %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp57 = extractelement <4 x float> %tmp56, i32 0 %tmp58 = fadd float %tmp38, %tmp57 %tmp59 = fadd float undef, %tmp46 @@ -59,7 +62,8 @@ %tmp62 = bitcast float %tmp60 to i32 %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1 %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2 - %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float> + %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp66 = extractelement <4 x float> %tmp65, i32 0 %tmp67 = fadd float %tmp58, %tmp66 %tmp68 = fmul float %tmp67, 1.250000e-01 @@ -99,33 +103,22 @@ ENDIF28: ; preds = %LOOP %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1 %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2 - %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float> + %tmp87 = call <4 x float> 
@llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp88 = extractelement <4 x float> %tmp87, i32 0 %tmp89 = fadd float undef, %tmp88 br label %LOOP } -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind } Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- 
test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -18,9 +18,9 @@ ;CHECK-NEXT: ; %main_body ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK-NOT: exec -define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { +define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) { main_body: - %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %c.2 = bitcast <4 x float> %c.1 to <4 x i32> %c.3 = extractelement <4 x i32> %c.2, i32 0 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 @@ -40,9 +40,9 @@ ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 @@ -68,10 +68,9 @@ %c.1 = mul i32 %c, %d call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) - - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> 
- %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1.bc = bitcast i32 %c.1 to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -99,9 +98,9 @@ br i1 %cmp, label %IF, label %ELSE IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -141,9 +140,9 @@ br i1 %cmp, label %ELSE, label %IF IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> 
@llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -198,7 +197,8 @@ END: %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %coord.END.bc = bitcast i32 %coord.END to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %tex } @@ -213,13 +213,11 @@ ;CHECK: image_sample ;CHECK: v_cmp ;CHECK: store -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %dtex.1 = 
extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) %cc = fcmp ogt float %dtex.1, 0.0 @@ -252,7 +250,7 @@ ;CHECK: %END ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ -263,9 +261,8 @@ br label %END END: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -284,10 +281,9 @@ ;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> 
%sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 %data.0 = extractelement <2 x float> %data, i32 0 call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) @@ -297,10 +293,8 @@ %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) - - %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %out = fadd <4 x float> %tex, %dtex ret <4 x float> %out @@ -318,11 +312,10 @@ ; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) @@ -373,8 +366,7 @@ br i1 %cc, label %break, label %body body: - %c.i = bitcast <4 x float> %c.iv to <4 x i32> - %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %ctr.next = fadd float %ctr.iv, 2.0 br label %loop @@ -414,9 +406,8 @@ %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx %c = load i32, i32* %c.gep, align 4 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %c.bc = bitcast i32 %c to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) ret void @@ -434,9 +425,8 @@ ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { - %tex = call <4 x float> 
@llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -448,10 +438,8 @@ ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { entry: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %cc = icmp sgt i32 %c, 0 br i1 %cc, label %if, label %else @@ -483,33 +471,28 @@ br i1 %cc, label %if, label %else if: - %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> 
undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end else: - %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end end: %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] - call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) - ret <4 x float> %r } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 - -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 - +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 
x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 declare void @llvm.AMDGPU.kill(float) #1 attributes #1 = { nounwind }