Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -865,7 +865,7 @@
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // byte offset(SGPR/imm)
    llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
-  [IntrNoMem, ImmArg<2>]>,
+  [IntrReadMem, ImmArg<2>]>,
   AMDGPURsrcIntrinsic<0>;
 
 class AMDGPUBufferStore : Intrinsic <
Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -928,24 +928,16 @@
     applyDefaultMapping(OpdMapper);
     executeInWaterfallLoop(MI, MRI, { 2 });
     return;
-  case AMDGPU::G_INTRINSIC: {
-    switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
-    case Intrinsic::amdgcn_s_buffer_load: {
-      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
-      executeInWaterfallLoop(MI, MRI, { 2, 3 });
-      return;
-    }
-    default:
-      break;
-    }
-    break;
-  }
   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
     switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
     case Intrinsic::amdgcn_buffer_load: {
       executeInWaterfallLoop(MI, MRI, { 2 });
       return;
     }
+    case Intrinsic::amdgcn_s_buffer_load: {
+      executeInWaterfallLoop(MI, MRI, { 2, 3 });
+      return;
+    }
     default:
       break;
     }
@@ -1466,28 +1458,6 @@
       = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
-  case Intrinsic::amdgcn_s_buffer_load: {
-    // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
-    unsigned RSrc = MI.getOperand(2).getReg();   // SGPR
-    unsigned Offset = MI.getOperand(3).getReg(); // SGPR/imm
-
-    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
-    unsigned Size3 = MRI.getType(Offset).getSizeInBits();
-
-    unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
-    unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
-    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
-    OpdsMapping[1] = nullptr; // intrinsic id
-
-    // Lie and claim everything is legal, even though some need to be
-    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
-    OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
-    OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
-    OpdsMapping[4] = nullptr;
-    break;
-  }
   case Intrinsic::amdgcn_div_scale: {
     unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
@@ -1573,6 +1543,28 @@
     OpdsMapping[6] = nullptr;
     break;
   }
+  case Intrinsic::amdgcn_s_buffer_load: {
+    // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
+    unsigned RSrc = MI.getOperand(2).getReg();   // SGPR
+    unsigned Offset = MI.getOperand(3).getReg(); // SGPR/imm
+
+    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
+    unsigned Size3 = MRI.getType(Offset).getSizeInBits();
+
+    unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
+    unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
+
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
+    OpdsMapping[1] = nullptr; // intrinsic id
+
+    // Lie and claim everything is legal, even though some need to be
+    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+    OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
+    OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
+    OpdsMapping[4] = nullptr;
+    break;
+  }
   }
   break;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -59,8 +59,9 @@
                              MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
-  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
-                       SDValue GLC, SDValue DLC, SelectionDAG &DAG) const;
+  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Chain,
+                       SDValue Rsrc, SDValue Offset, SDValue GLC,
+                       SDValue DLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5347,8 +5347,9 @@
   return SDValue(NewNode, 0);
 }
 
-SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
-                                       SDValue Offset, SDValue GLC, SDValue DLC,
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Chain,
+                                       SDValue Rsrc, SDValue Offset,
+                                       SDValue GLC, SDValue DLC,
                                        SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -5359,18 +5360,20 @@
 
   if (!Offset->isDivergent()) {
     SDValue Ops[] = {
+        Chain,
         Rsrc,
         Offset, // Offset
         GLC,
         DLC,
     };
     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   DAG.getVTList(VT), Ops, VT, MMO);
+                                   DAG.getVTList(VT, MVT::Other), Ops, VT, MMO);
   }
 
   // We have a divergent offset. Emit a MUBUF buffer load instead. We can
   // assume that the buffer is unswizzled.
   SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
   unsigned NumLoads = 1;
   MVT LoadVT = VT.getSimpleVT();
   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
@@ -5383,10 +5386,10 @@
     LoadVT = MVT::v4i32;
   }
 
-  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
   unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
   SDValue Ops[] = {
-      DAG.getEntryNode(),                   // Chain
+      Chain,
       Rsrc,                                 // rsrc
       DAG.getConstant(0, DL, MVT::i32),     // vindex
       {},                                   // voffset
@@ -5403,12 +5406,18 @@
   uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
   for (unsigned i = 0; i < NumLoads; ++i) {
     Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
-    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
-                                            Ops, LoadVT, MMO));
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                           Ops, LoadVT, MMO);
+    Loads.push_back(Load);
+    Chains.push_back(Load.getValue(1));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
+    SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+    return DAG.getMergeValues({Concat, DAG.getTokenFactor(DL, Chains)}, DL);
   }
 
-  if (VT == MVT::v8i32 || VT == MVT::v16i32)
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+  assert(NumLoads == 1);
   return Loads[0];
 }
@@ -5569,16 +5578,6 @@
   case Intrinsic::amdgcn_wavefrontsize:
     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                            SDLoc(Op), MVT::i32);
-  case Intrinsic::amdgcn_s_buffer_load: {
-    bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
-    SDValue GLC;
-    SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
-    if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
-                          IsGFX10 ? &DLC : nullptr))
-      return Op;
-    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
-                        DAG);
-  }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
   case Intrinsic::amdgcn_interp_mov: {
@@ -6359,7 +6358,19 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
+  case Intrinsic::amdgcn_s_buffer_load: {
+    bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+    SDValue GLC;
+    SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
+    if (!parseCachePolicy(Op.getOperand(4), DAG, &GLC, nullptr,
+                          IsGFX10 ? &DLC : nullptr))
+      return Op;
+    EVT VT = Op.getValueType();
+    return lowerSBuffer(VT, DL, Op.getOperand(0),
+                        Op.getOperand(2), Op.getOperand(3), GLC, DLC,
+                        DAG);
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -41,7 +41,7 @@
 def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
   SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>,
                        SDTCisVT<4, i1>]>,
-  [SDNPMayLoad, SDNPMemOperand]
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
 >;
 
 def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
Index: test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
+++ test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
@@ -14,10 +14,10 @@
     ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
    ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), 0
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), 0
    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
    %1:_(s32) = COPY $sgpr4
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
 
 ...
 
@@ -43,7 +43,7 @@
    ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1
    ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[V_READFIRSTLANE_B32_]](s32), 0
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[V_READFIRSTLANE_B32_]](s32), 0
    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
    ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -53,7 +53,7 @@
    ; CHECK: .3:
    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
    %1:_(s32) = COPY $vgpr0
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
 
 ...
 
@@ -88,7 +88,7 @@
    ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec
    ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), 0
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), 0
    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
    ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -98,7 +98,7 @@
    ; CHECK: .3:
    %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
    %1:_(s32) = COPY $sgpr0
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
 
 ...
 
@@ -136,7 +136,7 @@
    ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY1]](s32), implicit $exec
    ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[V_READFIRSTLANE_B32_4]](s32), 0
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[V_READFIRSTLANE_B32_4]](s32), 0
    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
    ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -146,6 +146,6 @@
    ; CHECK: .3:
    %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
    %1:_(s32) = COPY $vgpr4
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
 
 ...