Allow floating-point result types for llvm.amdgcn.s.buffer.load.

* IntrinsicsAMDGPU.td: the result type of int_amdgcn_s_buffer_load changes
  from llvm_anyint_ty to llvm_any_ty.
* SIISelLowering.cpp: the result-type assert now accepts i32 or f32 elements
  with a power-of-two element count; when an 8- or 16-element load is split,
  the 4-element partial loads keep the element type of the result.
* SMInstructions.td: SMLoad_Pattern instantiations for f32 .. v16f32.
* smrd.ll: tests for the f32, v2f32, v4f32, v8f32 and v16f32 variants.

Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -803,7 +803,7 @@
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
 
 def int_amdgcn_s_buffer_load : Intrinsic <
-  [llvm_anyint_ty],
+  [llvm_any_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // byte offset(SGPR/VGPR/imm)
    llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc)
@@ -835,7 +835,7 @@
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
@@ -847,7 +847,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
@@ -859,7 +859,7 @@
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
@@ -872,7 +872,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
@@ -884,7 +884,7 @@
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
@@ -904,7 +904,7 @@
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
@@ -915,7 +915,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1, 0>;
 def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
@@ -936,7 +936,7 @@
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
   [], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<2, 0>;
 
@@ -980,7 +980,7 @@
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
@@ -991,7 +991,7 @@
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
@@ -1002,7 +1002,7 @@
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrReadMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 
@@ -1014,7 +1014,7 @@
    llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
   [IntrWriteMem], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4873,12 +4873,14 @@
   SmallVector Loads;
   unsigned NumLoads = 1;
   MVT LoadVT = VT.getSimpleVT();
+  MVT EltVT = LoadVT.isVector() ? LoadVT.getVectorElementType() : LoadVT;
+  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
+  assert((EltVT == MVT::i32 || EltVT == MVT::f32) &&
+         isPowerOf2_32(NumElts));
 
-  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
-         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
-
-  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
-    NumLoads = VT == MVT::v16i32 ? 4 : 2;
-    LoadVT = MVT::v4i32;
+  if (NumElts == 8 || NumElts == 16) {
+    NumLoads = NumElts == 16 ? 4 : 2;
+    // Keep the result's element type so split f32 loads are not typed v4i32.
+    LoadVT = MVT::getVectorVT(EltVT, 4);
   }
 
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -751,6 +751,12 @@
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
 defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
 } // End let AddedComplexity = 100
 
 let OtherPredicates = [isSICI] in {
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -625,7 +625,6 @@
   ret float %sum.next
 }
 
-
 ; This test checks that the load after some control flow with an offset based
 ; on a divergent shader input is correctly recognized as divergent. This was
 ; reduced from an actual regression. Yes, the %unused argument matters, as
@@ -649,6 +648,45 @@
   ret float %tmp97
 }
 
+; GCN-LABEL: {{^}}s_buffer_load_f32:
+; GCN: s_buffer_load_dword s0, s[0:3], s4
+define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(float %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v2f32:
+; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v4f32:
+; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v8f32:
+; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr)
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_buffer_load_v16f32:
+; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4
+define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
+  %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0)
+  call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr)
+  ret void
+}
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
@@ -660,6 +698,12 @@
 declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
 declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
 
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
+declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32)
+declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32)
+declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32)
+declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32)
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind readnone speculatable }