diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1458,6 +1458,23 @@ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>]>; +// __int_amdgcn_lds_direct_load +// The input argument is m0, which contains a packed combination of address +// offset and flags describing the data type. +def int_amdgcn_lds_direct_load : + Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 + [llvm_i32_ty], + [IntrReadMem, IntrSpeculatable, IntrWillReturn]>; + +// __int_amdgcn_lds_param_load <attr_chan>, <attr>, <m0> +// Like interp intrinsics, this reads from lds, but the memory values are constant, +// so it behaves like IntrNoMem. +def int_amdgcn_lds_param_load : + Intrinsic<[llvm_float_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + // Deprecated: use llvm.amdgcn.live.mask instead. def int_amdgcn_ps_live : Intrinsic < [llvm_i1_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3008,7 +3008,8 @@ case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { applyDefaultMapping(OpdMapper); // Readlane for m0 value, which is always the last operand. @@ -3116,6 +3117,12 @@ constrainOpWithReadfirstlane(MI, MRI, 2); return; } + case Intrinsic::amdgcn_lds_direct_load: { + applyDefaultMapping(OpdMapper); + // Readlane for m0 value, which is always the last operand. 
+ constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index + return; + } default: { if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -4437,7 +4444,8 @@ case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: case Intrinsic::amdgcn_interp_p1_f16: - case Intrinsic::amdgcn_interp_p2_f16: { + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_lds_param_load: { const int M0Idx = MI.getNumOperands() - 1; Register M0Reg = MI.getOperand(M0Idx).getReg(); unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); @@ -4660,6 +4668,21 @@ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); break; } + case Intrinsic::amdgcn_lds_direct_load: { + const int M0Idx = MI.getNumOperands() - 1; + Register M0Reg = MI.getOperand(M0Idx).getReg(); + unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // Must be SGPR, but we must take whatever the original bank is and fix it + // later. 
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); + break; + } default: return getInvalidInstructionMapping(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -230,6 +230,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence<int_amdgcn_lds_direct_load>; +def : SourceOfDivergence<int_amdgcn_lds_param_load>; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td --- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td +++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td @@ -91,6 +91,16 @@ def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + //===----------------------------------------------------------------------===// // GFX11+ //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.direct.load.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: lds_direct_load_s 
+legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: lds_direct_load_s + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), %0 +... + +--- +name: lds_direct_load_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lds_direct_load_v + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), [[V_READFIRSTLANE_B32_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.lds.direct.load), %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.lds.param.load.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: lds_param_load_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: lds_param_load_s + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, %0 +... + +--- +name: lds_param_load_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lds_param_load_v + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, [[V_READFIRSTLANE_B32_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.lds.param.load), 1, 1, %0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s + +; GFX11-LABEL: {{^}}lds_direct_load: +; GFX11: s_mov_b32 m0 +; GFX11: lds_direct_load v{{[0-9]+}} +; GFX11: s_mov_b32 m0 +; GFX11: lds_direct_load v{{[0-9]+}} +; GFX11: s_mov_b32 m0 +; GFX11: lds_direct_load v{{[0-9]+}} +; GFX11: v_add_f32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +define amdgpu_ps void @lds_direct_load(<4 x i32> inreg %buf, i32 inreg %arg0, + i32 inreg %arg1, i32 inreg %arg2) #0 { +main_body: + %p0 = call float @llvm.amdgcn.lds.direct.load(i32 %arg0) + ; Ensure memory clustering is occurring for lds_direct_load + %p5 = fadd float %p0, 1.0 + %p1 = call float @llvm.amdgcn.lds.direct.load(i32 %arg1) + %p2 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2) + %p3 = call float @llvm.amdgcn.lds.direct.load(i32 %arg1) + %p4 = call float @llvm.amdgcn.lds.direct.load(i32 %arg2) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0) + ret void +} + +declare float @llvm.amdgcn.lds.direct.load(i32) #1 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) + +attributes #0 = { 
+ nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s + +; GFX11-LABEL: {{^}}lds_param_load: +; GFX11: s_mov_b32 m0 +; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.x +; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.y +; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.z +; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.w +; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr1.x +; GFX11: v_add_f32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +; GFX11: buffer_store_b32 +define amdgpu_ps void @lds_param_load(<4 x i32> inreg %buf, i32 inreg %arg) #0 { +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %arg) + ; Ensure memory clustering is occurring for lds_param_load + %p5 = fadd float %p0, 1.0 + %p1 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %arg) + %p2 = call float @llvm.amdgcn.lds.param.load(i32 2, i32 0, i32 %arg) + %p3 = call float @llvm.amdgcn.lds.param.load(i32 3, i32 0, i32 %arg) + %p4 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %arg) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p5, <4 x i32> %buf, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p1, <4 x i32> %buf, i32 4, i32 1, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p2, <4 x i32> %buf, i32 4, i32 2, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p3, <4 x i32> %buf, i32 4, i32 3, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p4, <4 x i32> %buf, i32 4, i32 4, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %p0, <4 x i32> %buf, i32 4, i32 5, i32 0) + ret void +} 
+ +declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1 +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }