Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2913,22 +2913,38 @@
   Register Dst = MI.getOperand(0).getReg();
   LLT Ty = B.getMRI()->getType(Dst);
   unsigned Size = Ty.getSizeInBits();
+  MachineFunction &MF = B.getMF();
+
+  Observer.changingInstr(MI);
+
+  // FIXME: We don't really need this intermediate instruction. The intrinsic
+  // should be fixed to have a memory operand. Since it's readnone, we're not
+  // allowed to add one.
+  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+  MI.RemoveOperand(1); // Remove intrinsic ID
+
+  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
+  // TODO: Should this use datalayout alignment?
+  const unsigned MemSize = (Size + 7) / 8;
+  const unsigned MemAlign = 4;
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+    MachinePointerInfo(),
+    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+    MachineMemOperand::MOInvariant, MemSize, MemAlign);
+  MI.addMemOperand(MF, MMO);

   // There are no 96-bit result scalar loads, but widening to 128-bit should
   // always be legal. We may need to restore this to a 96-bit result if it turns
   // out this needs to be converted to a vector load during RegBankSelect.
-  if (isPowerOf2_32(Size))
-    return true;
-
-  LegalizerHelper Helper(B.getMF(), *this, Observer, B);
-  B.setInstr(MI);
-
-  Observer.changingInstr(MI);
+  if (!isPowerOf2_32(Size)) {
+    LegalizerHelper Helper(MF, *this, Observer, B);
+    B.setInstr(MI);

-  if (Ty.isVector())
-    Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
-  else
-    Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+    if (Ty.isVector())
+      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
+    else
+      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+  }

   Observer.changedInstr(MI);
   return true;
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1355,9 +1355,9 @@
   LLT Ty = MRI.getType(Dst);

   const RegisterBank *RSrcBank =
-    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
   const RegisterBank *OffsetBank =
-    OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
   if (RSrcBank == &AMDGPU::SGPRRegBank &&
       OffsetBank == &AMDGPU::SGPRRegBank)
     return true; // Legal mapping
@@ -1383,7 +1383,7 @@

   Register VOffset;
   int64_t ImmOffset = 0;
-  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(3).getReg(),
+  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                         VOffset, SOffset, ImmOffset, Align);

   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
@@ -1401,7 +1401,7 @@

   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
   // assume that the buffer is unswizzled.
-  Register RSrc = MI.getOperand(2).getReg();
+  Register RSrc = MI.getOperand(1).getReg();
   Register VIndex = B.buildConstant(S32, 0).getReg(0);
   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

@@ -2492,12 +2492,12 @@
     executeInWaterfallLoop(MI, MRI, {3, 6});
     return;
   }
+  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+    applyMappingSBufferLoad(OpdMapper);
+    return;
+  }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
-    case Intrinsic::amdgcn_s_buffer_load: {
-      applyMappingSBufferLoad(OpdMapper);
-      return;
-    }
     case Intrinsic::amdgcn_readlane: {
       substituteSimpleCopyRegs(OpdMapper, 2);

@@ -3380,6 +3380,22 @@
     // initialized.
     break;
   }
+  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+    // Lie and claim everything is legal, even though some need to be
+    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+    // We need to convert this to a MUBUF if either the resource or offset is
+    // VGPR.
+    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
+    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
+    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
+
+    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
+    break;
+  }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
     default:
@@ -3464,24 +3480,6 @@
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
       break;
     }
-    case Intrinsic::amdgcn_s_buffer_load: {
-      // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
-
-      // Lie and claim everything is legal, even though some need to be
-      // SGPRs. applyMapping will have to deal with it as a waterfall loop.
-      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
-      OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
-
-      // We need to convert this to a MUBUF if either the resource of offset is
-      // VGPR.
-      unsigned RSrcBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
-      unsigned OffsetBank = OpdsMapping[3]->BreakDown[0].RegBank->getID();
-      unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
-
-      unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-      OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
-      break;
-    }
     case Intrinsic::amdgcn_div_scale: {
       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2278,3 +2278,14 @@
   let mayLoad = 1;
   let mayStore = 1;
 }
+
+// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
+// a workaround for the intrinsic being defined as readnone, but it
+// really needs a memory operand.
+def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -2,6 +2,24 @@
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefix=GCN %s

+---
+name: s_buffer_load_s32
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GCN-LABEL: name: s_buffer_load_s32
+    ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 4)
+    ; GCN: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](s32)
+    %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    S_ENDPGM 0, implicit %2
+
+...
+
 ---
 name: s_buffer_load_v3s32
 body: |
@@ -11,8 +29,8 @@
     ; GCN-LABEL: name: s_buffer_load_v3s32
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s32>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -30,8 +48,8 @@
     ; GCN-LABEL: name: s_buffer_load_v3p3
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<4 x p3>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x p3>) = G_EXTRACT [[INT]](<4 x p3>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x p3>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x p3>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x p3>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x p3>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -49,8 +67,8 @@
     ; GCN-LABEL: name: s_buffer_load_v6s16
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[INT]](<8 x s16>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s16>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s16>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s16>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s16>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -68,8 +86,8 @@
     ; GCN-LABEL: name: s_buffer_load_v6s32
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s32>) = G_EXTRACT [[INT]](<8 x s32>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 24, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<6 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<6 x s32>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -87,8 +105,8 @@
     ; GCN-LABEL: name: s_buffer_load_v3s64
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<4 x s64>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[INT]](<4 x s64>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 24, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s64>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s64>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -106,8 +124,8 @@
     ; GCN-LABEL: name: s_buffer_load_v12s8
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<16 x s8>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[INT]](<16 x s8>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<16 x s8>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<12 x s8>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<16 x s8>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<12 x s8>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
@@ -125,8 +143,8 @@
     ; GCN-LABEL: name: s_buffer_load_s96
     ; GCN: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; GCN: [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[C]](s32), 0
-    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+    ; GCN: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+    ; GCN: [[EXTRACT:%[0-9]+]]:_(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
     ; GCN: S_ENDPGM 0, implicit [[EXTRACT]](<3 x s32>)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = G_CONSTANT i32 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# XUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s

 ---
 name: buffer_load_ss
@@ -14,10 +14,10 @@
     ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), 0
+    ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[COPY1]](s32), 0
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = COPY $sgpr4
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0

...

@@ -38,7 +38,7 @@
     ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
     %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:_(s32) = COPY $vgpr0
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0

...

@@ -86,7 +86,7 @@
     ; CHECK: .3:
     %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %1:_(s32) = COPY $sgpr0
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0

...

@@ -133,6 +133,6 @@
     ; CHECK: .3:
     %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %1:_(s32) = COPY $vgpr4
-    %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0
+    %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0

...
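
A note on the MMO sizes that now appear in the check lines above (and in the regbankselect checks below): the memory size is taken from the original result type before any widening, rounded up to whole bytes, which is why every 96-bit result (s96, <3 x s32>, <6 x s16>, <12 x s8>) carries "load 12, align 4" even though the destination register is widened to 128 bits. A minimal standalone sketch of that calculation; computeSBufferMemSize is a hypothetical helper for illustration, not a function in this patch:

    #include <cassert>

    // Mirror of the (Size + 7) / 8 rounding the legalizer uses before
    // attaching the memory operand to G_AMDGPU_S_BUFFER_LOAD.
    static unsigned computeSBufferMemSize(unsigned SizeInBits) {
      return (SizeInBits + 7) / 8;
    }

    int main() {
      assert(computeSBufferMemSize(32) == 4);   // s32        -> load 4
      assert(computeSBufferMemSize(96) == 12);  // <3 x s32>  -> load 12, align 4
      assert(computeSBufferMemSize(192) == 24); // <6 x s32>  -> load 24, align 4
      assert(computeSBufferMemSize(512) == 64); // <16 x s32> -> load 64, align 4
      return 0;
    }
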
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -13,8 +13,8 @@
   ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0
-  ; CHECK: $sgpr0 = COPY [[INT]](s32)
+  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
+  ; CHECK: $sgpr0 = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret i32 %val
@@ -30,8 +30,8 @@
   ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK: [[INT:%[0-9]+]]:sgpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0
-  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>)
+  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
+  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
   ; CHECK: $sgpr0 = COPY [[UV]](s32)
   ; CHECK: $sgpr1 = COPY [[UV1]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
@@ -49,8 +49,8 @@
   ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0
-  ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0
+  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
+  ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
   ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
   ; CHECK: $sgpr0 = COPY [[UV]](s32)
   ; CHECK: $sgpr1 = COPY [[UV1]](s32)
@@ -70,8 +70,8 @@
   ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK: [[INT:%[0-9]+]]:sgpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0
-  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<8 x s32>)
+  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
+  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
   ; CHECK: $sgpr0 = COPY [[UV]](s32)
   ; CHECK: $sgpr1 = COPY [[UV1]](s32)
   ; CHECK: $sgpr2 = COPY [[UV2]](s32)
@@ -95,8 +95,8 @@
   ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK: [[INT:%[0-9]+]]:sgpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0
-  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<16 x s32>)
+  ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
+  ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
   ; CHECK: $sgpr0 = COPY [[UV]](s32)
   ; CHECK: $sgpr1 = COPY [[UV1]](s32)
   ; CHECK: $sgpr2 = COPY [[UV2]](s32)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir
@@ -42,7 +42,7 @@
     %1:_(s32) = COPY $sgpr0
     %2:vgpr(s32) = G_CONSTANT i32 256
     %3:_(s32) = G_ADD %1, %2
-    %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %3, 0
+    %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0
     S_ENDPGM 0, implicit %4

...
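
For reference, the register-bank rule the new G_AMDGPU_S_BUFFER_LOAD mapping encodes is that the result bank is the union of the rsrc and offset banks: with both inputs uniform (SGPR) the scalar load stays legal as-is, while a divergent (VGPR) rsrc or offset forces a VGPR result, and applyMappingSBufferLoad then rewrites the load to a MUBUF buffer load, adding a waterfall loop when the rsrc itself is divergent. A small illustrative distillation of that choice; pickResultBank is a hypothetical stand-in for regBankUnion, not code from this patch:

    // Hypothetical sketch of the bank choice made for G_AMDGPU_S_BUFFER_LOAD:
    // any VGPR input forces the VGPR (MUBUF) path; all-SGPR inputs keep the
    // scalar form.
    enum BankID { SGPRBank, VGPRBank };

    static BankID pickResultBank(BankID RSrcBank, BankID OffsetBank) {
      return (RSrcBank == VGPRBank || OffsetBank == VGPRBank) ? VGPRBank
                                                              : SGPRBank;
    }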