Index: test/CodeGen/AMDGPU/bug-vopc-commute.ll =================================================================== --- test/CodeGen/AMDGPU/bug-vopc-commute.ll +++ test/CodeGen/AMDGPU/bug-vopc-commute.ll @@ -8,8 +8,8 @@ ; of which were in SGPRs. define amdgpu_vs float @main(i32 %v) { main_body: - %d1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 960) - %d2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 976) + %d1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 960, i32 0) + %d2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 976, i32 0) br i1 undef, label %ENDIF56, label %IF57 IF57: ; preds = %ENDIF @@ -41,7 +41,7 @@ } ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #0 attributes #0 = { nounwind readnone } attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/scheduler-subrange-crash.ll =================================================================== --- test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -15,9 +15,9 @@ define amdgpu_gs void @main(i32 inreg %arg) #0 { main_body: - %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 20) - %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 24) - %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 48) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 20, i32 0) + %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 24, i32 0) + %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 48, i32 0) %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3 %array_vector5 = insertelement <4 x float> , float %tmp, i32 1 %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2 @@ -45,7 +45,7 @@ ret void } -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -8,9 +8,9 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0) %tmp24 = fptosi float %tmp22 to i32 %tmp25 = icmp ne i32 %tmp24, 0 br i1 %tmp25, label %ENDIF, label %ELSE @@ -32,21 +32,21 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36) - %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 40) - %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 48) - %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 52) - %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 56) - %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 64) - %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 68) - %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 72) - %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 76) - %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 80) - %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84) - %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88) - %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 36, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 40, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 48, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 52, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 56, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 64, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 68, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 72, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 76, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 80, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 84, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 88, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 92, i32 0) %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0 %tmp37 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp36, !tbaa !0 %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0 @@ -172,10 +172,10 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8) - %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 12) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 4, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 8, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 12, i32 0) %tmp25 = fptosi float %tmp24 to i32 %tmp26 = bitcast i32 %tmp25 to float %tmp27 = bitcast float %tmp26 to i32 @@ -225,7 +225,7 @@ entry: %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 16, i32 0) %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 %tmp24 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp23, !tbaa !0 %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 @@ -325,7 +325,7 @@ bb: %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0 %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !3 - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp22, i32 16, i32 0) %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(4)* %arg3, i32 0, i32 0 %tmp26 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp25, !tbaa !3 %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg2, i32 0, i32 0 @@ -409,7 +409,7 @@ declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -28,44 +28,44 @@ main_body: %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100) - %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104) - %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 112) - %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 116) - %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 120) - %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) - %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) - %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 140) - %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) - %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) - %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) - %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) - %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) - %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) - %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) - %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) - %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) - %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) - %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) - %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 224) - %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) - %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) - %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) - %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) - %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) - %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) - %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) - %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) - %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) - %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 296) - %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 304) - %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 308) - %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 312) - %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 368) - %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372) - %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376) - %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 384) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 140, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 224, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 296, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 304, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 308, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 312, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 368, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 372, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 376, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 384, i32 0) %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 %tmp61 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp60, !tbaa !0 %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 @@ -647,109 +647,109 @@ main_body: %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4) - %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8) - %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 12) - %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 28) - %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 48) - %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 52) - %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 56) - %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 64) - %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 68) - %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 72) - %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 76) - %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) - %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) - %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) - %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 148) - %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 152) - %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) - %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 164) - %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 168) - %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 172) - %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) - %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) - %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) - %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) - %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) - %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) - %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) - %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) - %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) - %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 220) - %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 236) - %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) - %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) - %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) - %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 252) - %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) - %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 260) - %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 264) - %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 268) - %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) - %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) - %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) - %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 284) - %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) - %tmp67 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) - %tmp68 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 464) - %tmp69 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 468) - %tmp70 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 472) - %tmp71 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 496) - %tmp72 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 500) - %tmp73 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 504) - %tmp74 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 512) - %tmp75 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 516) - %tmp76 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 524) - %tmp77 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 532) - %tmp78 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 536) - %tmp79 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 540) - %tmp80 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 544) - %tmp81 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 548) - %tmp82 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 552) - %tmp83 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 556) - %tmp84 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 560) - %tmp85 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 564) - %tmp86 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 568) - %tmp87 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 572) - %tmp88 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 576) - %tmp89 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 580) - %tmp90 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 584) - %tmp91 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 588) - %tmp92 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 592) - %tmp93 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 596) - %tmp94 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 600) - %tmp95 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 604) - %tmp96 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 608) - %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 612) - %tmp98 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 616) - %tmp99 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 624) - %tmp100 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 628) - %tmp101 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 632) - %tmp102 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 636) - %tmp103 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 640) - %tmp104 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 644) - %tmp105 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 648) - %tmp106 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 652) - %tmp107 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 656) - %tmp108 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 660) - %tmp109 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 664) - %tmp110 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 668) - %tmp111 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 672) - %tmp112 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 676) - %tmp113 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 680) - %tmp114 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 684) - %tmp115 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 688) - %tmp116 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 692) - %tmp117 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 696) - %tmp118 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 700) - %tmp119 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 704) - %tmp120 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 708) - %tmp121 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 712) - %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716) - %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864) - %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 0, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 4, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 8, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 12, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 28, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 48, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 52, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 56, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 64, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 68, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 72, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 76, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 148, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 152, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 164, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 168, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 172, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 220, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 236, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 252, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 260, i32 0) + %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 264, i32 0) + %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 268, i32 0) + %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0) + %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0) + %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0) + %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 284, i32 0) + %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0) + %tmp67 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0) + %tmp68 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 464, i32 0) + %tmp69 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 468, i32 0) + %tmp70 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 472, i32 0) + %tmp71 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 496, i32 0) + %tmp72 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 500, i32 0) + %tmp73 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 504, i32 0) + %tmp74 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 512, i32 0) + %tmp75 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 516, i32 0) + %tmp76 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 524, i32 0) + %tmp77 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 532, i32 0) + %tmp78 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 536, i32 0) + %tmp79 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 540, i32 0) + %tmp80 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 544, i32 0) + %tmp81 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 548, i32 0) + %tmp82 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 552, i32 0) + %tmp83 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 556, i32 0) + %tmp84 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 560, i32 0) + %tmp85 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 564, i32 0) + %tmp86 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 568, i32 0) + %tmp87 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 572, i32 0) + %tmp88 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 576, i32 0) + %tmp89 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 580, i32 0) + %tmp90 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 584, i32 0) + %tmp91 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 588, i32 0) + %tmp92 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 592, i32 0) + %tmp93 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 596, i32 0) + %tmp94 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 600, i32 0) + %tmp95 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 604, i32 0) + %tmp96 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 608, i32 0) + %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 612, i32 0) + %tmp98 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 616, i32 0) + %tmp99 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 624, i32 0) + %tmp100 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 628, i32 0) + %tmp101 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 632, i32 0) + %tmp102 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 636, i32 0) + %tmp103 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 640, i32 0) + %tmp104 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 644, i32 0) + %tmp105 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 648, i32 0) + %tmp106 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 652, i32 0) + %tmp107 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 656, i32 0) + %tmp108 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 660, i32 0) + %tmp109 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 664, i32 0) + %tmp110 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 668, i32 0) + %tmp111 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 672, i32 0) + %tmp112 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 676, i32 0) + %tmp113 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 680, i32 0) + %tmp114 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 684, i32 0) + %tmp115 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 688, i32 0) + %tmp116 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 692, i32 0) + %tmp117 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 696, i32 0) + %tmp118 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 700, i32 0) + %tmp119 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 704, i32 0) + %tmp120 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 708, i32 0) + %tmp121 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 712, i32 0) + %tmp122 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 716, i32 0) + %tmp123 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 864, i32 0) + %tmp124 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 868, i32 0) %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 %tmp126 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp125, !tbaa !0 %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 @@ -1683,7 +1683,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-spill-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-cf.ll +++ test/CodeGen/AMDGPU/si-spill-cf.ll @@ -9,73 +9,73 @@ define amdgpu_ps void @main() #0 { main_body: - %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 16) - %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 32) - %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 80) - %tmp3 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 84) - %tmp4 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 88) - %tmp5 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) - %tmp6 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 100) - %tmp7 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 104) - %tmp8 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 112) - %tmp9 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 116) - %tmp10 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 120) - %tmp11 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 128) - %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 132) - %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 136) - %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 144) - %tmp15 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 148) - %tmp16 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 152) - %tmp17 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 160) - %tmp18 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 164) - %tmp19 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 168) - %tmp20 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 176) - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 180) - %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 184) - %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 192) - %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 196) - %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 200) - %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 208) - %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 212) - %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 216) - %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 224) - %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 228) - %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 232) - %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 240) - %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 244) - %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 248) - %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 256) - %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 260) - %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 264) - %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 272) - %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 276) - %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 280) - %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 288) - %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 292) - %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 296) - %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 304) - %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 308) - %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 312) - %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 320) - %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 324) - %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 328) - %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 336) - %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 340) - %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 344) - %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 352) - %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 356) - %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 360) - %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 368) - %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 372) - %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 376) - %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 384) - %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 388) - %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 392) - %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 400) - %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 404) - %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 408) - %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 416) - %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 420) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 16, i32 0) + %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 32, i32 0) + %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 80, i32 0) + %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 84, i32 0) + %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 88, i32 0) + %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0) + %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 100, i32 0) + %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 104, i32 0) + %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 112, i32 0) + %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 116, i32 0) + %tmp10 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 120, i32 0) + %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 128, i32 0) + %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 132, i32 0) + %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 136, i32 0) + %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 144, i32 0) + %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 148, i32 0) + %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 152, i32 0) + %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 160, i32 0) + %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 164, i32 0) + %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 168, i32 0) + %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 176, i32 0) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 180, i32 0) + %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 184, i32 0) + %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 192, i32 0) + %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 196, i32 0) + %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 200, i32 0) + %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 208, i32 0) + %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 212, i32 0) + %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 216, i32 0) + %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 224, i32 0) + %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 228, i32 0) + %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 232, i32 0) + %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 240, i32 0) + %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 244, i32 0) + %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 248, i32 0) + %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 256, i32 0) + %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 260, i32 0) + %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 264, i32 0) + %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 272, i32 0) + %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 276, i32 0) + %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 280, i32 0) + %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 288, i32 0) + %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 292, i32 0) + %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 296, i32 0) + %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 304, i32 0) + %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 308, i32 0) + %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 312, i32 0) + %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 320, i32 0) + %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 324, i32 0) + %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 328, i32 0) + %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 336, i32 0) + %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 340, i32 0) + %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 344, i32 0) + %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 352, i32 0) + %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 356, i32 0) + %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 360, i32 0) + %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 368, i32 0) + %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 372, i32 0) + %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 376, i32 0) + %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 384, i32 0) + %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 388, i32 0) + %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 392, i32 0) + %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 400, i32 0) + %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 404, i32 0) + %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 408, i32 0) + %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 416, i32 0) + %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 420, i32 0) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body @@ -497,7 +497,7 @@ declare float @llvm.maxnum.f32(float, float) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -98,7 +98,7 @@ %d1 = insertelement <4 x i32> %d0, i32 1, i32 1 %d2 = insertelement <4 x i32> %d1, i32 2, i32 2 %d3 = insertelement <4 x i32> %d2, i32 3, i32 3 - %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %d3, i32 0) + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %d3, i32 0, i32 0) ret float %r } @@ -110,7 +110,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -126,7 +126,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1) %s.buffer.float = bitcast i32 %s.buffer to float @@ -149,7 +149,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float @@ -170,7 +170,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float @@ -190,7 +190,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float @@ -278,7 +278,7 @@ ; GCN: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4 define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 { main_body: - %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) ret float %r } @@ -286,7 +286,7 @@ ; GCN: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ; define amdgpu_ps float @smrd_vgpr_offset(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: - %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) ret float %r } @@ -296,7 +296,7 @@ define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4092 - %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off) + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0) ret float %r } @@ -308,7 +308,7 @@ define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 { main_body: %off = add i32 %offset, 4096 - %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off) + %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0) ret float %r } @@ -320,12 +320,12 @@ ; VIGFX9-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 { main_body: - %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4) - %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8) - %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12) - %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16) - %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28) - %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32) + %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0) + %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 8, i32 0) + %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 12, i32 0) + %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 16, i32 0) + %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 28, i32 0) + %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 32, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0 ret void @@ -352,7 +352,7 @@ ; define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { main_body: - %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0) + %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 0) %idx1 = bitcast float %idx1.f to i32 %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim) @@ -377,7 +377,7 @@ %v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2 %b = extractelement <3 x float> %v1, i32 %idx1 - %c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4) + %c = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0) %res.tmp = fadd float %a, %b %res = fadd float %res.tmp, %c @@ -396,12 +396,12 @@ %a4 = add i32 %a, 16 %a5 = add i32 %a, 28 %a6 = add i32 %a, 32 - %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a1) - %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a2) - %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a3) - %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a4) - %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a5) - %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a6) + %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a1, i32 0) + %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a2, i32 0) + %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a3, i32 0) + %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a4, i32 0) + %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a5, i32 0) + %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a6, i32 0) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0 ret void @@ -428,14 +428,14 @@ .inner_loop_body: %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0 - %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0) + %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0) store float %load1result, float addrspace(1)* undef %inner_br2 = icmp uge i32 %1, 10 br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body .outer_loop_body: %offset = shl i32 %loopctr.2, 6 - %load2result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 %offset) + %load2result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 %offset, i32 0) %outer_br = fcmp ueq float %load2result, 0x0 br i1 %outer_br, label %.outer_loop_header, label %ret_block } @@ -451,7 +451,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float @@ -470,7 +470,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) %s.buffer.float = bitcast i32 %s.buffer to float @@ -489,7 +489,7 @@ main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp - %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff) + %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0) %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in %s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0) %s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1 @@ -581,7 +581,7 @@ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ] %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ] %offset = shl i32 %counter, 2 - %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) %sum.next = fadd float %sum, %v %counter.next = add i32 %counter, 1 %cc = icmp uge i32 %counter.next, %bound @@ -607,7 +607,7 @@ %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ] %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ] %offset = shl i32 %counter, 2 - %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0) %sum.next = fadd float %sum, %v %counter.next = add i32 %counter, 1 %cc = icmp uge i32 %counter.next, %bound @@ -644,7 +644,7 @@ endif1: ; preds = %if1, %main_body %tmp13 = extractelement <3 x i32> %arg4, i32 0 - %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 %tmp13) + %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 %tmp13, i32 0) ret float %tmp97 } @@ -689,10 +689,9 @@ } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2 -declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) Index: test/CodeGen/AMDGPU/split-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/split-smrd.ll +++ test/CodeGen/AMDGPU/split-smrd.ll @@ -8,7 +8,7 @@ ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(4)* byval %arg) #0 { bb: - %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) + %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0) %tmp1 = bitcast float %tmp to i32 br i1 undef, label %bb2, label %bb3 @@ -31,7 +31,7 @@ declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -31,9 +31,9 @@ bb: %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i64 0, i64 0 %tmp11 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, align 16, !tbaa !0 - %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0) - %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16) - %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 32) + %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 0, i32 0) + %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 16, i32 0) + %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 32, i32 0) %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg4, i64 0, i64 0 %tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 @@ -488,7 +488,7 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 attributes #0 = { nounwind }