Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -99,6 +99,19 @@
   GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
   Intrinsic<[], [], []>;
 
+// SI+
+def int_amdgcn_buffer_load_format :
+  Intrinsic<[llvm_anyfloat_ty],
+            [llvm_v4i32_ty, // rsrc(SGPR)
+             llvm_i32_ty,   // sgpr_offset(SGPR or 0)
+             llvm_i32_ty,   // inst_offset(imm)
+             llvm_i32_ty,   // vgpr_offset(VGPR or 0)
+             llvm_i32_ty,   // vgpr_index(VGPR or 0)
+             llvm_i1_ty,    // glc
+             llvm_i1_ty,    // slc
+             llvm_i1_ty],   // tfe
+            [IntrNoMem]>;
+
 // On CI+
 def int_amdgcn_buffer_wbinvl1_vol :
   GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadShrinkPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
@@ -56,6 +57,9 @@
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
+void initializeSILoadShrinkPass(PassRegistry &);
+extern char &SILoadShrinkPassID;
+
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -48,6 +48,7 @@
   initializeSIFoldOperandsPass(*PR);
   initializeSIFixSGPRLiveRangesPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
+  initializeSILoadShrinkPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
 }
 
@@ -165,6 +166,7 @@
 public:
   GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) { }
+  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -265,6 +267,11 @@
 // GCN Pass Setup
 //===----------------------------------------------------------------------===//
 
+void GCNPassConfig::addCodeGenPrepare() {
+  addPass(createSILoadShrinkPass());
+  AMDGPUPassConfig::addCodeGenPrepare();
+}
+
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
   addPass(createSinkingPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
+  SILoadShrink.cpp
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -42,8 +42,6 @@
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
-  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
-
   SDValue performUCharToFloatCombine(SDNode *N,
                                      DAGCombinerInfo &DCI) const;
   SDValue performSHLPtrCombine(SDNode *N,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1999,97 +1999,6 @@
   return -1;
 }
 
-/// \brief Helper function for adjustWritemask
-static unsigned SubIdx2Lane(unsigned Idx) {
-  switch (Idx) {
-  default: return 0;
-  case AMDGPU::sub0: return 0;
-  case AMDGPU::sub1: return 1;
-  case AMDGPU::sub2: return 2;
-  case AMDGPU::sub3: return 3;
-  }
-}
-
-/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
-                                       SelectionDAG &DAG) const {
-  SDNode *Users[4] = { };
-  unsigned Lane = 0;
-  unsigned OldDmask = Node->getConstantOperandVal(0);
-  unsigned NewDmask = 0;
-
-  // Try to figure out the used register components
-  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
-       I != E; ++I) {
-
-    // Abort if we can't understand the usage
-    if (!I->isMachineOpcode() ||
-        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
-      return;
-
-    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
-    // Note that subregs are packed, i.e. Lane==0 is the first bit set
-    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
-    // set, etc.
-    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
-
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
-      assert(Dmask);
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
-
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return;
-
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
-  }
-
-  // Abort if there's no change
-  if (NewDmask == OldDmask)
-    return;
-
-  // Adjust the writemask in the node
-  std::vector<SDValue> Ops;
-  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
-  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
-  // If we only got one lane, replace it with a copy
-  // (if NewDmask has only one bit set...)
-  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
-    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
-                                       MVT::i32);
-    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                      SDLoc(), Users[Lane]->getValueType(0),
-                                      SDValue(Node, 0), RC);
-    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
-    return;
-  }
-
-  // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
-
-    SDNode *User = Users[i];
-    if (!User)
-      continue;
-
-    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
-
-    switch (Idx) {
-    default: break;
-    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
-    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
-    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
-    }
-  }
-}
-
 static bool isFrameIndexOp(SDValue Op) {
   if (Op.getOpcode() == ISD::AssertZext)
     Op = Op.getOperand(0);
@@ -2125,8 +2034,25 @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  if (TII->isMIMG(Node->getMachineOpcode()))
-    adjustWritemask(Node, DAG);
+  if (TII->isMIMG(Node->getMachineOpcode())) {
+    unsigned NumWritten = countPopulation(Node->getConstantOperandVal(0));
+    if (NumWritten == 1) {
+      SDValue RC =
+          DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), MVT::i32);
+      SDNode *Copy = DAG.getMachineNode(
+          TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+          Node->getValueType(0).getVectorElementType(), SDValue(Node, 0), RC);
+
+      for (SDNode *Use : Node->uses()) {
+        if (Use != Copy) {
+          assert(Use->isMachineOpcode() &&
+                 Use->getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG);
+          DAG.ReplaceAllUsesWith(Use, Copy);
+        }
+      }
+    }
+    return Node;
+  }
 
   if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
       Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -2982,6 +2982,56 @@
 defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET,
                          BUFFER_LOAD_DWORDX4_OFFEN,
                          BUFFER_LOAD_DWORDX4_IDXEN,
                          BUFFER_LOAD_DWORDX4_BOTHEN>;
 
+multiclass MUBUF_Load_Format <ValueType vt, MUBUF offset, MUBUF offen,
+                              MUBUF idxen, MUBUF bothen> {
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        0, 0, imm:$glc, imm:$slc, imm:$tfe)),
+    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+            (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        i32:$voffset, 0, imm:$glc, imm:$slc,
+                                        imm:$tfe)),
+    (offen $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        0, i32:$vindex, imm:$glc, imm:$slc,
+                                        imm:$tfe)),
+    (idxen $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        i32:$voffset, i32:$vindex, imm:$glc,
+                                        imm:$slc, imm:$tfe)),
+    (bothen (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc,
+            $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+            (as_i1imm $tfe))
+  >;
+}
+
+defm : MUBUF_Load_Format <f32, BUFFER_LOAD_FORMAT_X_OFFSET,
+                          BUFFER_LOAD_FORMAT_X_OFFEN,
+                          BUFFER_LOAD_FORMAT_X_IDXEN,
+                          BUFFER_LOAD_FORMAT_X_BOTHEN>;
+
+defm : MUBUF_Load_Format <v2f32, BUFFER_LOAD_FORMAT_XY_OFFSET,
+                          BUFFER_LOAD_FORMAT_XY_OFFEN,
+                          BUFFER_LOAD_FORMAT_XY_IDXEN,
+                          BUFFER_LOAD_FORMAT_XY_BOTHEN>;
+
+defm : MUBUF_Load_Format <v4f32, BUFFER_LOAD_FORMAT_XYZW_OFFSET,
+                          BUFFER_LOAD_FORMAT_XYZW_OFFEN,
+                          BUFFER_LOAD_FORMAT_XYZW_IDXEN,
+                          BUFFER_LOAD_FORMAT_XYZW_BOTHEN>;
+
 class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
   (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
                                u16imm:$offset)),
Index: lib/Target/AMDGPU/SILoadShrink.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SILoadShrink.cpp
@@ -0,0 +1,246 @@
+//===-- SILoadShrink.cpp - Shrink load intrinsics -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrinks target-specific intrinsics that perform loads, such as
+// buffer.load.format.* (and TODO: tex), when only part of the loaded value
+// is actually used.
+//
+//===----------------------------------------------------------------------===//
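+//
+// Illustrative example (not taken from this patch's tests): given
+//
+//   %v = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(...)
+//   %x = extractelement <4 x float> %v, i32 0
+//
+// where no other element of %v is used, the call is rewritten to the scalar
+// overload and the users of %x are redirected to its result:
+//
+//   %x = call float @llvm.amdgcn.buffer.load.format.f32(...)
+//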
+
+#include "AMDGPU.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "si-load-shrink"
+
+using namespace llvm;
+
+namespace {
+
+class SILoadShrink : public FunctionPass, public InstVisitor<SILoadShrink> {
+public:
+  static char ID;
+
+  SILoadShrink() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  const char *getPassName() const override { return "SI Load Shrink"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  void visitCallInst(CallInst &C);
+  void visitIntrinsicInst(IntrinsicInst &I);
+
+private:
+  void adjustReturnType(IntrinsicInst &I);
+  void adjustWritemask(CallInst &Call, int DMaskIndex);
+
+  bool AnyChanges;
+  std::vector<Instruction *> Replaced;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILoadShrink, DEBUG_TYPE, "SI Load Shrink", false, false)
+INITIALIZE_PASS_END(SILoadShrink, DEBUG_TYPE, "SI Load Shrink", false, false)
+
+char SILoadShrink::ID = 0;
+
+FunctionPass *llvm::createSILoadShrinkPass() { return new SILoadShrink; }
+
+bool SILoadShrink::runOnFunction(Function &F) {
+  AnyChanges = false;
+
+  visit(F);
+
+  for (Instruction *I : Replaced) {
+    I->eraseFromParent();
+  }
+  Replaced.clear();
+
+  return AnyChanges;
+}
+
+void SILoadShrink::visitCallInst(CallInst &I) {
+  Function *Callee = I.getCalledFunction();
+  if (!Callee)
+    return;
+
+  // TODO Move SI intrinsics to global list so IntrinsicID works?
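+  // The llvm.SI.image.sample.* and llvm.SI.gather4.* intrinsics are matched
+  // by name prefix; their dmask operand is argument index 3, which is what
+  // adjustWritemask() rewrites.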
+  static const char SI_image_sample[] = "llvm.SI.image.sample.";
+  static const char SI_gather4[] = "llvm.SI.gather4.";
+  if (Callee->getName().startswith(SI_image_sample) ||
+      Callee->getName().startswith(SI_gather4)) {
+    adjustWritemask(I, 3);
+  }
+}
+
+void SILoadShrink::visitIntrinsicInst(IntrinsicInst &I) {
+  Function *Callee = I.getCalledFunction();
+
+  if (Callee->getIntrinsicID() == Intrinsic::amdgcn_buffer_load_format) {
+    adjustReturnType(I);
+  }
+}
+
+void SILoadShrink::adjustReturnType(IntrinsicInst &I) {
+  Type *OrigType = I.getType();
+  if (!OrigType->isVectorTy())
+    return;
+
+  const unsigned OrigNumWritten = OrigType->getVectorNumElements();
+  uint64_t HighestIndex = 0;
+
+  SmallVector<ExtractElementInst *, 4> Uses;
+  for (const auto &use : I.uses()) {
+    ExtractElementInst *EE = dyn_cast<ExtractElementInst>(use.getUser());
+    if (!EE) {
+      return;
+    }
+
+    ConstantInt *IndexConstant = dyn_cast<ConstantInt>(EE->getIndexOperand());
+    if (!IndexConstant) {
+      return;
+    }
+
+    Uses.push_back(EE);
+
+    if (EE->use_empty())
+      continue;
+
+    HighestIndex = std::max(HighestIndex, IndexConstant->getZExtValue());
+    if (HighestIndex + 1 >= OrigNumWritten)
+      return;
+  }
+
+  unsigned NewNumWritten = HighestIndex + 1;
+  if (NewNumWritten == 3) {
+    NewNumWritten = 4; // TODO XYZ codegen
+    if (NewNumWritten >= OrigNumWritten) {
+      return;
+    }
+  }
+
+  DEBUG(dbgs() << "SILoadShrink: from " << OrigNumWritten << " to "
+               << NewNumWritten << "\n");
+
+  IRBuilder<> Builder(&I);
+
+  Type *NewTypes[1];
+  if (NewNumWritten == 1) {
+    NewTypes[0] = Type::getFloatTy(I.getContext());
+  } else {
+    NewTypes[0] =
+        VectorType::get(OrigType->getVectorElementType(), NewNumWritten);
+  }
+
+  Function *NewCallee =
+      Intrinsic::getDeclaration(I.getModule(), I.getIntrinsicID(), NewTypes);
+  SmallVector<Value *, 8> Args;
+  for (const Use &Arg : I.arg_operands())
+    Args.push_back(Arg.get());
+  CallInst *NewCall = Builder.CreateCall(NewCallee, Args);
+
+  for (ExtractElementInst *EE : Uses) {
+    if (!EE->use_empty()) {
+      if (NewNumWritten == 1) {
+        EE->replaceAllUsesWith(NewCall);
+      } else {
+        EE->replaceAllUsesWith(
+            Builder.CreateExtractElement(NewCall, EE->getIndexOperand()));
+      }
+    }
+    Replaced.push_back(EE);
+  }
+
+  Replaced.push_back(&I);
+
+  AnyChanges = true;
+}
+
+void SILoadShrink::adjustWritemask(CallInst &Call, int DMaskIndex) {
+  ConstantInt *DMaskConstant =
+      dyn_cast<ConstantInt>(Call.getArgOperand(DMaskIndex));
+  if (!DMaskConstant)
+    return;
+
+  const unsigned OrigDMask = DMaskConstant->getZExtValue();
+  const unsigned OrigNumWritten = countPopulation(OrigDMask);
+
+  // Collect all uses, bailing out early when everything is used
+  const unsigned UseAllMask = (1 << OrigNumWritten) - 1;
+  unsigned UseMask = 0;
+  SmallVector<std::pair<unsigned, ExtractElementInst *>, 4> Uses;
+
+  for (const auto &use : Call.uses()) {
+    ExtractElementInst *EE = dyn_cast<ExtractElementInst>(use.getUser());
+    if (!EE) {
+      return;
+    }
+
+    ConstantInt *IndexConstant = dyn_cast<ConstantInt>(EE->getIndexOperand());
+    if (!IndexConstant) {
+      return;
+    }
+
+    if (EE->use_empty())
+      continue;
+
+    const unsigned Index = IndexConstant->getZExtValue();
+    UseMask |= 1 << Index;
+    if (UseMask == UseAllMask)
+      return;
+
+    Uses.emplace_back(Index, EE);
+  }
+
+  // If not all written channels are used, compute index remapping
+  Type *Int32Ty = Type::getInt32Ty(Call.getContext());
+  SmallVector<ExtractElementInst *, 4> Remapped;
+  unsigned NewDMask = 0;
+  unsigned NewIndex = 0;
+  unsigned Component = 0;
+  for (unsigned OrigIndex = 0; OrigIndex < OrigNumWritten;
+       ++OrigIndex, ++Component) {
+    while ((OrigDMask & (1 << Component)) == 0)
+      ++Component;
+
+    ExtractElementInst *NewEE = nullptr;
+    if (UseMask & (1 << OrigIndex)) {
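+      // This channel remains live. If its position within the shrunk result
+      // changes, build a replacement extractelement at the new index; users
+      // of the old extract are redirected to it after the loop.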
+      if (OrigIndex != NewIndex) {
+        DEBUG(dbgs() << "  Remap " << OrigIndex << " to " << NewIndex << "\n");
+        NewEE = ExtractElementInst::Create(&Call,
+                                           ConstantInt::get(Int32Ty, NewIndex));
+        NewEE->insertAfter(&Call);
+      }
+
+      NewDMask |= 1 << Component;
+      ++NewIndex;
+    }
+
+    Remapped.push_back(NewEE);
+  }
+
+  // Commit remapping
+  Call.setArgOperand(DMaskIndex, ConstantInt::get(Int32Ty, NewDMask));
+
+  for (const auto &OldIndexUse : Uses) {
+    if (Value *RemappedValue = Remapped[OldIndexUse.first]) {
+      OldIndexUse.second->replaceAllUsesWith(RemappedValue);
+    }
+  }
+
+  AnyChanges = true;
+}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+; TODO:
+; - check soffset & (immediate) offset
+; - check glc/slc
+; - check vector versions
+; - v3f32 version: how?
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen ; encoding
+; CHECK: s_endpgm
+
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
+main_body:
+  %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
+  %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
+  %rsrc = bitcast <16 x i8> %12 to <4 x i32>
+  %vgpr0 = add i32 %5, %7
+  %r0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i1 0, i1 0, i1 0)
+  %r1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 %5, i1 0, i1 0, i1 0)
+  %r2 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 %7, i32 0, i1 0, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 %7, i32 %5, i1 0, i1 0, i1 0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}