Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -242,6 +242,12 @@
   "Support DPP (Data Parallel Primitives) extension"
 >;
 
+def FeatureR128A16 : SubtargetFeature<"r128-a16",
+  "HasR128A16",
+  "true",
+  "Support 16 bit coordinates/gradients/lod/clamp/mip types on gfx9"
+>;
+
 def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
   "HasIntClamp",
   "true",
@@ -444,7 +450,7 @@
    FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
    FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
    FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
-   FeatureAddNoCarryInsts, FeatureScalarAtomics
+   FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16
   ]
 >;
@@ -703,6 +709,9 @@
 def HasDPP : Predicate<"Subtarget->hasDPP()">,
   AssemblerPredicate<"FeatureDPP">;
 
+def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
+  AssemblerPredicate<"FeatureR128A16">;
+
 def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
   AssemblerPredicate<"FeatureIntClamp">;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -354,6 +354,7 @@
   bool HasSDWAMac;
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
+  bool HasR128A16;
   bool HasDLInsts;
   bool D16PreservesUnusedBits;
   bool FlatAddressSpace;
@@ -790,6 +791,10 @@
     return HasDPP;
   }
 
+  bool hasR128A16() const {
+    return HasR128A16;
+  }
+
   bool enableSIScheduler() const {
     return EnableSIScheduler;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -197,6 +197,7 @@
     HasSDWAMac(false),
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
+    HasR128A16(false),
     HasDLInsts(false),
     D16PreservesUnusedBits(false),
     FlatAddressSpace(false),
Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -156,7 +156,7 @@
     ImmTyDMask,
     ImmTyUNorm,
     ImmTyDA,
-    ImmTyR128,
+    ImmTyR128A16,
     ImmTyLWE,
     ImmTyExpTgt,
     ImmTyExpCompr,
@@ -291,7 +291,7 @@
   bool isDMask() const { return isImmTy(ImmTyDMask); }
   bool isUNorm() const { return isImmTy(ImmTyUNorm); }
   bool isDA() const { return isImmTy(ImmTyDA); }
-  bool isR128() const { return isImmTy(ImmTyR128); }
+  bool isR128A16() const { return isImmTy(ImmTyR128A16); }
   bool isLWE() const { return isImmTy(ImmTyLWE); }
   bool isOff() const { return isImmTy(ImmTyOff); }
   bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -681,7 +681,7 @@
     case ImmTyDMask: OS << "DMask"; break;
     case ImmTyUNorm: OS << "UNorm"; break;
     case ImmTyDA: OS << "DA"; break;
-    case ImmTyR128: OS << "R128"; break;
+    case ImmTyR128A16: OS << "R128A16"; break;
     case ImmTyLWE: OS << "LWE"; break;
     case ImmTyOff: OS << "Off"; break;
     case ImmTyExpTgt: OS << "ExpTgt"; break;
@@ -1092,7 +1092,6 @@
   bool validateMIMGAtomicDMask(const MCInst &Inst);
   bool validateMIMGGatherDMask(const MCInst &Inst);
   bool validateMIMGDataSize(const MCInst &Inst);
-  bool validateMIMGR128(const MCInst &Inst);
   bool validateMIMGD16(const MCInst &Inst);
   bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
   bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -2447,22 +2446,6 @@
   return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
 }
 
-bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
-
-  const unsigned Opc = Inst.getOpcode();
-  const MCInstrDesc &Desc = MII.get(Opc);
-
-  if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
-    return true;
-
-  int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
-  assert(Idx != -1);
-
-  bool R128 = (Inst.getOperand(Idx).getImm() != 0);
-
-  return !R128 || hasMIMG_R128();
-}
-
 bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
 
   const unsigned Opc = Inst.getOpcode();
@@ -2497,11 +2480,6 @@
           "integer clamping is not supported on this GPU");
     return false;
   }
-  if (!validateMIMGR128(Inst)) {
-    Error(IDLoc,
-      "r128 modifier is not supported on this GPU");
-    return false;
-  }
   // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
   if (!validateMIMGD16(Inst)) {
     Error(IDLoc,
@@ -3465,6 +3443,10 @@
   case AsmToken::Identifier: {
     StringRef Tok = Parser.getTok().getString();
     if (Tok == Name) {
+      if (Tok == "r128" && isGFX9())
+        Error(S, "r128 modifier is not supported on this GPU");
+      if (Tok == "a16" && !isGFX9())
+        Error(S, "a16 modifier is not supported on this GPU");
       Bit = 1;
       Parser.Lex();
     } else if (Tok.startswith("no") && Tok.endswith(Name)) {
@@ -4661,7 +4643,7 @@
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
-  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -4772,7 +4754,8 @@
   {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
   {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
   {"da", AMDGPUOperand::ImmTyDA, true, nullptr},
-  {"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
+  {"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr},
+  {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr},
   {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
   {"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
   {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
===================================================================
--- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -80,7 +80,7 @@
                raw_ostream &O);
   void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                raw_ostream &O);
-  void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+  void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                raw_ostream &O);
   void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                raw_ostream &O);
Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -207,9 +207,12 @@
   printNamedBit(MI, OpNo, O, "da");
 }
 
-void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
                                   const MCSubtargetInfo &STI,
                                   raw_ostream &O) {
-  printNamedBit(MI, OpNo, O, "r128");
+  if (STI.hasFeature(AMDGPU::FeatureR128A16))
+    printNamedBit(MI, OpNo, O, "a16");
+  else
+    printNamedBit(MI, OpNo, O, "r128");
 }
 
 void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -141,7 +141,7 @@
   let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                      #!if(BaseOpcode.HasD16, "$d16", "");
@@ -199,7 +199,7 @@
   let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                      #!if(BaseOpcode.HasD16, "$d16", "");
@@ -252,7 +252,7 @@
   let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
                            DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                           R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+                           R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
   let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
 }
@@ -316,7 +316,7 @@
   let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
                                 DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
-                                R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+                                R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
                            !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
   let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
                      #!if(BaseOpcode.HasD16, "$d16", "");
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
 
-#include "AMDGPUISelLowering.h"
 #include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUISelLowering.h"
 #include "SIInstrInfo.h"
 
 namespace llvm {
@@ -26,16 +26,17 @@
   const GCNSubtarget *Subtarget;
 
 public:
-  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
-                                    CallingConv::ID CC,
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                     EVT VT) const override;
 
   unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                          CallingConv::ID CC,
                                          EVT VT) const override;
 
-  unsigned getVectorTypeBreakdownForCallingConv(
-    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
-    unsigned &NumIntermediates, MVT &RegisterVT) const override;
+  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+                                                CallingConv::ID CC, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const override;
 
 private:
   SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
@@ -49,15 +50,14 @@
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
                               const ISD::InputArg &Arg) const;
-  SDValue getPreloadedValue(SelectionDAG &DAG,
-                            const SIMachineFunctionInfo &MFI,
+  SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI,
                             EVT VT,
                             AMDGPUFunctionArgInfo::PreloadedValue) const;
 
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG
&DAG) const override; - SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, - MVT VT, unsigned Offset) const; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, + unsigned Offset) const; SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; @@ -80,22 +80,20 @@ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, - SelectionDAG &DAG, ArrayRef Ops, + SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, + ArrayRef Ops, bool IsIntrinsic = false) const; SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; /// Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it. - SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, - SDValue Op, - const SDLoc &DL, + SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const; - SDValue convertArgType( - SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, - bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, + SDValue Val, bool Signed, + const ISD::InputArg *Arg = nullptr) const; /// Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; @@ -112,11 +110,8 @@ SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const; - SDValue performSHLPtrCombine(SDNode *N, - unsigned AS, - EVT MemVT, + SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT, DAGCombinerInfo &DCI) const; SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const; @@ -144,8 +139,8 @@ SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const; - unsigned getFusedOpcode(const SelectionDAG &DAG, - const SDNode *N0, const SDNode *N1) const; + unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, + const SDNode *N1) const; SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -191,8 +186,8 @@ unsigned IntrinsicID) const override; bool getAddrModeArguments(IntrinsicInst * /*I*/, - SmallVectorImpl &/*Ops*/, - Type *&/*AccessTy*/) const override; + SmallVectorImpl & /*Ops*/, + Type *& /*AccessTy*/) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, @@ -201,14 +196,11 @@ bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; bool isMemOpUniform(const SDNode *N) const; @@ -220,7 +212,7 @@ 
getPreferredVectorAction(EVT VT) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; + Type *Ty) const override; bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; @@ -229,8 +221,8 @@ bool supportSplitCSR(MachineFunction *MF) const override; void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( - MachineBasicBlock *Entry, - const SmallVectorImpl &Exits) const override; + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -238,8 +230,8 @@ const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; - bool CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; @@ -248,13 +240,11 @@ const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; - void passSpecialInputs( - CallLoweringInfo &CLI, - const SIMachineFunctionInfo &Info, - SmallVectorImpl> &RegsToPass, - SmallVectorImpl &MemOpChains, - SDValue Chain, - SDValue StackPtr) const; + void + passSpecialInputs(CallLoweringInfo &CLI, const SIMachineFunctionInfo &Info, + SmallVectorImpl> &RegsToPass, + SmallVectorImpl &MemOpChains, SDValue Chain, + SDValue StackPtr) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, @@ -266,15 +256,15 @@ bool mayBeEmittedAsTailCall(const CallInst *) const override; bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const; + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, + unsigned getRegisterByName(const char *RegName, EVT VT, SelectionDAG &DAG) const override; MachineBasicBlock *splitKillBlock(MachineInstr &MI, @@ -317,14 +307,13 @@ void finalizeLowering(MachineFunction &MF) const override; - void computeKnownBitsForFrameIndex(const SDValue Op, - KnownBits &Known, + void computeKnownBitsForFrameIndex(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; - bool isSDNodeSourceOfDivergence(const SDNode *N, - FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; + bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, + DivergenceAnalysis *DA) const override; bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -22,11 +22,11 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include 
"llvm/ADT/APInt.h" @@ -91,15 +91,14 @@ STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt EnableVGPRIndexMode( - "amdgpu-vgpr-index-mode", - cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), - cl::init(false)); + "amdgpu-vgpr-index-mode", + cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), + cl::init(false)); static cl::opt AssumeFrameIndexHighZeroBits( - "amdgpu-frame-index-zero-bits", - cl::desc("High bits of frame index assumed to be zero"), - cl::init(5), - cl::ReallyHidden); + "amdgpu-frame-index-zero-bits", + cl::desc("High bits of frame index assumed to be zero"), cl::init(5), + cl::ReallyHidden); static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); @@ -113,8 +112,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, const GCNSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), - Subtarget(&STI) { + : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -243,8 +241,8 @@ // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, + MVT::v2f64, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -274,7 +272,7 @@ // Most operations are naturally 32-bit vector operations. We only support // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. - for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) { setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); @@ -616,7 +614,7 @@ setOperationAction(ISD::FABS, MVT::v2f16, Custom); } - for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { + for (MVT VT : {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8}) { setOperationAction(ISD::SELECT, VT, Custom); } @@ -673,9 +671,7 @@ setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); } -const GCNSubtarget *SITargetLowering::getSubtarget() const { - return Subtarget; -} +const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; } //===----------------------------------------------------------------------===// // TargetLowering queries @@ -685,8 +681,8 @@ // // There is only one special case when denormals are enabled we don't currently, // where this is OK to use. -bool SITargetLowering::isFPExtFoldable(unsigned Opcode, - EVT DestVT, EVT SrcVT) const { +bool SITargetLowering::isFPExtFoldable(unsigned Opcode, EVT DestVT, + EVT SrcVT) const { return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && @@ -712,8 +708,7 @@ if (Size == 64) return MVT::i32; - if (Size == 16 && - Subtarget->has16BitInsts() && + if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(VT.getVectorNumElements())) return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; } @@ -744,9 +739,8 @@ } unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, CallingConv::ID CC, - EVT VT, EVT &IntermediateVT, - unsigned &NumIntermediates, MVT &RegisterVT) const { + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); @@ -777,7 +771,7 @@ } return TargetLowering::getVectorTypeBreakdownForCallingConv( - Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); + Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -786,22 +780,22 @@ unsigned IntrID) const { if (const AMDGPU::RsrcIntrinsic *RsrcIntr = AMDGPU::lookupRsrcIntrinsic(IntrID)) { - AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), - (Intrinsic::ID)IntrID); + AttributeList Attr = + Intrinsic::getAttributes(CI.getContext(), (Intrinsic::ID)IntrID); if (Attr.hasFnAttribute(Attribute::ReadNone)) return false; SIMachineFunctionInfo *MFI = MF.getInfo(); if (RsrcIntr->IsImage) { - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget().getInstrInfo(), - CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.ptrVal = + MFI->getImagePSV(*MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); Info.align = 0; } else { - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget().getInstrInfo(), - CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.ptrVal = + MFI->getBufferPSV(*MF.getSubtarget().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); } Info.flags = MachineMemOperand::MODereferenceable; @@ -817,8 +811,7 @@ // Atomic Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable; // XXX - Should this be volatile without known ordering? @@ -852,7 +845,7 @@ } bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, - SmallVectorImpl &Ops, + SmallVectorImpl &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: @@ -889,15 +882,15 @@ return isInt<13>(AM.BaseOffs) && AM.Scale == 0; if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { - // Assume the we will use FLAT for all global memory accesses - // on VI. - // FIXME: This assumption is currently wrong. On VI we still use - // MUBUF instructions for the r + i addressing mode. As currently - // implemented, the MUBUF instructions only work on buffer < 4GB. - // It may be possible to support > 4GB buffers with MUBUF instructions, - // by setting the stride value in the resource descriptor which would - // increase the size limit to (stride * 4GB). However, this is risky, - // because it has never been validated. + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. 
return isLegalFlatAddressingMode(AM); } @@ -941,7 +934,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS, Instruction *I) const { + unsigned AS, + Instruction *I) const { // No global is ever allowed as a base. if (AM.BaseGV) return false; @@ -973,7 +967,8 @@ // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -990,8 +985,7 @@ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { return isLegalMUBUFAddressingMode(AM); - } else if (AS == AMDGPUASI.LOCAL_ADDRESS || - AS == AMDGPUASI.REGION_ADDRESS) { + } else if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.REGION_ADDRESS) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1074,8 +1068,9 @@ // buffer instruction if unaligned. if (IsFast) { *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || - AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? - (Align % 4 == 0) : true; + AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) + ? (Align % 4 == 0) + : true; } return true; @@ -1096,8 +1091,7 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, + bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { // FIXME: Should account for address space here. @@ -1115,8 +1109,7 @@ } static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { - return AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS || + return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; } @@ -1201,21 +1194,21 @@ const ArgDescriptor *InputPtrReg; const TargetRegisterClass *RC; - std::tie(InputPtrReg, RC) - = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + std::tie(InputPtrReg, RC) = + Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); + SDValue BasePtr = DAG.getCopyFromReg( + Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); return DAG.getObjectPtrOffset(SL, BasePtr, Offset); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const { - uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), - FIRST_IMPLICIT); + uint64_t Offset = + getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT); return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } @@ -1223,8 +1216,7 @@ const SDLoc &SL, SDValue Val, bool Signed, const ISD::InputArg *Arg) const { - if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && - VT.bitsLT(MemVT)) { + if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext : ISD::AssertSext; Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); } @@ -1240,10 +1232,9 @@ } SDValue SITargetLowering::lowerKernargMemParameter( - SelectionDAG &DAG, EVT VT, EVT MemVT, - const SDLoc &SL, SDValue Chain, - uint64_t Offset, unsigned Align, bool Signed, - const ISD::InputArg *Arg) const { + SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, + uint64_t Offset, unsigned Align, bool Signed, + const ISD::InputArg *Arg) const { Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -1263,7 +1254,7 @@ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); @@ -1272,21 +1263,21 @@ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); - - return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); + return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL); } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); - return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); + return DAG.getMergeValues({Val, Load.getValue(1)}, SL); } -SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, - const SDLoc &SL, SDValue Chain, +SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, + CCValAssign &VA, const SDLoc &SL, + SDValue Chain, const ISD::InputArg &Arg) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1328,16 +1319,14 @@ } ArgValue = DAG.getExtLoad( - ExtType, SL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - MemVT); + ExtType, SL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); return ArgValue; } -SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, - const SIMachineFunctionInfo &MFI, - EVT VT, - AMDGPUFunctionArgInfo::PreloadedValue PVID) const { +SDValue SITargetLowering::getPreloadedValue( + SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue PVID) const { const ArgDescriptor *Reg; const TargetRegisterClass *RC; @@ -1348,8 +1337,7 @@ static void processShaderInputArgs(SmallVectorImpl &Splits, CallingConv::ID CallConv, ArrayRef Ins, - BitVector &Skipped, - FunctionType *FType, + BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; @@ -1358,8 +1346,8 @@ "vector type argument should have been split"); // First check if it's a PS input addr. 
- if (CallConv == CallingConv::AMDGPU_PS && - !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { + if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() && + !Arg->Flags.isByVal() && PSInputNum <= 15) { bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); @@ -1395,8 +1383,7 @@ } // Allocate special inputs passed in VGPRs. -static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { if (Info.hasWorkItemIDX()) { @@ -1427,8 +1414,8 @@ // Try to allocate a VGPR at the end of the argument list, or if no argument // VGPRs are left allocating a stack slot. static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { - ArrayRef ArgVGPRs - = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); + ArrayRef ArgVGPRs = + makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); if (RegIdx == ArgVGPRs.size()) { // Spill to stack required. @@ -1471,8 +1458,7 @@ return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, +static void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { if (Info.hasWorkItemIDX()) @@ -1485,8 +1471,7 @@ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); } -static void allocateSpecialInputSGPRs(CCState &CCInfo, - MachineFunction &MF, +static void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { auto &ArgInfo = Info.getArgInfo(); @@ -1521,8 +1506,7 @@ } // Allocate special inputs passed in user SGPRs. -static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineFunction &MF, +static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { if (Info.hasImplicitBufferPtr()) { @@ -1573,11 +1557,9 @@ } // Allocate special input registers that are initialized per-wave. -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, +static void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { + CallingConv::ID CallConv, bool IsShader) { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); @@ -1608,7 +1590,7 @@ if (IsShader) { PrivateSegmentWaveByteOffsetReg = - Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); + Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); // This is true if the scratch wave byte offset doesn't have a fixed // location. @@ -1654,8 +1636,8 @@ // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + unsigned PrivateSegmentBufferReg = + Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); if (MFI.hasCalls()) { @@ -1667,19 +1649,18 @@ // FIXME: Nothing is really ensuring this is a call preserved register, // it's just selected from the end so it happens to be. 
- unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + unsigned ReservedOffsetReg = + TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); Info.setScratchWaveOffsetReg(ReservedOffsetReg); } else { unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); } } else { - unsigned ReservedBufferReg - = TRI.reservedPrivateSegmentBufferReg(MF); - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg = + TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); // We tentatively reserve the last registers (skipping the last two // which may contain VCC). After register allocation, we'll replace @@ -1699,11 +1680,11 @@ if (HasStackObjects && !MFI.hasCalls()) { unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + unsigned ReservedOffsetReg = + TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } @@ -1714,13 +1695,11 @@ return !Info->isEntryFunction(); } -void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { - -} +void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {} void SITargetLowering::insertCopiesSplitCSR( - MachineBasicBlock *Entry, - const SmallVectorImpl &Exits) const { + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); @@ -1743,13 +1722,13 @@ // Create copy from CSR to a virtual register. Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) - .addReg(*I); + .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) - .addReg(NewVR); + .addReg(NewVR); } } @@ -1811,8 +1790,8 @@ // enabled too. if (CallConv == CallingConv::AMDGPU_PS) { if ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11))) { + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); Info->markPSInputAllocated(0); @@ -1820,28 +1799,29 @@ } if (Subtarget->isAmdPalOS()) { // For isAmdPalOS, the user does not enable some bits after compilation - // based on run-time states; the register values being generated here are + // based on run-time states; the register values being generated here + // are // the final ones set in hardware. Therefore we need to apply the - // workaround to PSInputAddr and PSInputEnable together. (The case where + // workaround to PSInputAddr and PSInputEnable together. (The case + // where // a bit is set in PSInputAddr but not PSInputEnable is where the // frontend set up an input arg for a particular interpolation mode, but // nothing uses that input arg. Really we should have an earlier pass // that removes such an arg.) 
- unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + unsigned PsInputBits = + Info->getPSInputAddr() & Info->getPSInputEnable(); if ((PsInputBits & 0x7F) == 0 || - ((PsInputBits & 0xF) == 0 && - (PsInputBits >> 11 & 1))) + ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) Info->markPSInputEnabled( countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } } - assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && + !Info->hasFlatScratchInit() && !Info->hasWorkGroupIDX() && + !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && + !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { @@ -1869,7 +1849,7 @@ // kern arg offset. const unsigned KernelArgBaseAlign = 16; - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); @@ -1886,12 +1866,13 @@ const uint64_t Offset = VA.getLocMemOffset(); unsigned Align = MinAlign(KernelArgBaseAlign, Offset); - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); + SDValue Arg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Align, + Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = - dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); + dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always @@ -1927,8 +1908,9 @@ // automatically inserted sret (i.e. CanLowerReturn returns false), an // extra copy is inserted in SelectionDAGBuilder which obscures this. 
unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; - Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); + Val = DAG.getNode( + ISD::AssertZext, DL, VT, Val, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); } // If this is an 8 or 16-bit value, it is really passed promoted @@ -1941,13 +1923,11 @@ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); break; case CCValAssign::SExt: - Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, - DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT)); Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); break; case CCValAssign::ZExt: - Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, - DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT)); Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); break; case CCValAssign::AExt: @@ -1975,24 +1955,21 @@ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } - auto &ArgUsageInfo = - DAG.getPass()->getAnalysis(); + auto &ArgUsageInfo = DAG.getPass()->getAnalysis(); ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); unsigned StackArgSize = CCInfo.getNextStackOffset(); Info->setBytesInStackArgArea(StackArgSize); - return Chains.empty() ? Chain : - DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + return Chains.empty() ? Chain + : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } // TODO: If return values can't fit in registers, we should return as many as // possible in registers before passing on stack. bool SITargetLowering::CanLowerReturn( - CallingConv::ID CallConv, - MachineFunction &MF, bool IsVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const { + CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { // Replacing returns with sret/stack usage doesn't make sense for shaders. // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn // for shaders. Vector types should be explicitly handled by CC. @@ -2042,13 +2019,13 @@ if (!Info->isEntryFunction()) { const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); + DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); // FIXME: Should be able to use a vreg here, but need a way to prevent it // from being allcoated to a CSR. 
- SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), - MVT::i64); + SDValue PhysReturnAddrReg = + DAG.getRegister(TRI->getReturnAddressReg(MF), MVT::i64); Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag); Flag = Chain.getValue(1); @@ -2093,7 +2070,7 @@ if (!Info->isEntryFunction()) { const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = - TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (AMDGPU::SReg_64RegClass.contains(*I)) @@ -2136,7 +2113,8 @@ SDValue Val; if (VA.isRegLoc()) { - Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); } else if (VA.isMemLoc()) { @@ -2176,11 +2154,9 @@ // Add code to pass special inputs required depending on used features separate // from the explicit user arguments present in the IR. void SITargetLowering::passSpecialInputs( - CallLoweringInfo &CLI, - const SIMachineFunctionInfo &Info, + CallLoweringInfo &CLI, const SIMachineFunctionInfo &Info, SmallVectorImpl> &RegsToPass, - SmallVectorImpl &MemOpChains, - SDValue Chain, + SmallVectorImpl &MemOpChains, SDValue Chain, SDValue StackPtr) const { // If we don't have a call site, this was a call inserted by // legalization. These can never use special inputs. @@ -2195,10 +2171,9 @@ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - auto &ArgUsageInfo = - DAG.getPass()->getAnalysis(); - const AMDGPUFunctionArgInfo &CalleeArgInfo - = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + auto &ArgUsageInfo = DAG.getPass()->getAnalysis(); + const AMDGPUFunctionArgInfo &CalleeArgInfo = + ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); @@ -2206,18 +2181,17 @@ // the fact that at least in kernels, the input argument is not necessarily // in the same location as the input. AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { - AMDGPUFunctionArgInfo::DISPATCH_PTR, - AMDGPUFunctionArgInfo::QUEUE_PTR, - AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, - AMDGPUFunctionArgInfo::DISPATCH_ID, - AMDGPUFunctionArgInfo::WORKGROUP_ID_X, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, - AMDGPUFunctionArgInfo::WORKITEM_ID_X, - AMDGPUFunctionArgInfo::WORKITEM_ID_Y, - AMDGPUFunctionArgInfo::WORKITEM_ID_Z, - AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR - }; + AMDGPUFunctionArgInfo::DISPATCH_PTR, + AMDGPUFunctionArgInfo::QUEUE_PTR, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::DISPATCH_ID, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::WORKITEM_ID_X, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR}; for (auto InputID : InputRegs) { const ArgDescriptor *OutgoingArg; @@ -2229,8 +2203,8 @@ const ArgDescriptor *IncomingArg; const TargetRegisterClass *IncomingArgRC; - std::tie(IncomingArg, IncomingArgRC) - = CallerArgInfo.getPreloadedValue(InputID); + std::tie(IncomingArg, IncomingArgRC) = + CallerArgInfo.getPreloadedValue(InputID); assert(IncomingArgRC == ArgRC); // All special arguments are ints for now. 
@@ -2249,9 +2223,8 @@ if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); } else { - SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, - InputReg, - OutgoingArg->getStackOffset()); + SDValue ArgStore = storeStackInputValue( + DAG, DL, Chain, StackPtr, InputReg, OutgoingArg->getStackOffset()); MemOpChains.push_back(ArgStore); } } @@ -2392,16 +2365,16 @@ if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { // Note the issue is with the CC of the calling function, not of the call // itself. - return lowerUnhandledCall(CLI, InVals, - "unsupported call from graphics shader of function "); + return lowerUnhandledCall( + CLI, InVals, "unsupported call from graphics shader of function "); } // The first 4 bytes are reserved for the callee's emergency stack slot. const unsigned CalleeUsableStackOffset = 4; if (IsTailCall) { - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); + IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg, + Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) { report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -2467,20 +2440,20 @@ unsigned OffsetReg = Info->getScratchWaveOffsetReg(); // In the HSA case, this should be an identity copy. - SDValue ScratchRSrcReg - = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); + SDValue ScratchRSrcReg = + DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); // TODO: Don't hardcode these registers and get from the callee function. - SDValue ScratchWaveOffsetReg - = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); + SDValue ScratchWaveOffsetReg = + DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); if (!Info->isEntryFunction()) { // Avoid clobbering this function's FP value. In the current convention // callee will overwrite this, so do save/restore around the call site. - CallerSavedFP = DAG.getCopyFromReg(Chain, DL, - Info->getFrameOffsetReg(), MVT::i32); + CallerSavedFP = + DAG.getCopyFromReg(Chain, DL, Info->getFrameOffsetReg(), MVT::i32); } } @@ -2535,14 +2508,14 @@ if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - unsigned OpSize = Flags.isByVal() ? - Flags.getByValSize() : VA.getValVT().getStoreSize(); + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() + : VA.getValVT().getStoreSize(); Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), - StackPtr); + DstAddr = + DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), StackPtr); DstInfo = MachinePointerInfo::getFixedStack(MF, FI); // Make sure any stack arguments overlapping with where we're storing @@ -2586,12 +2559,11 @@ // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; for (auto &RegToPass : RegsToPass) { - Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, - RegToPass.second, InFlag); + Chain = + DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InFlag); InFlag = Chain.getValue(1); } - SDValue PhysReturnAddrReg; if (IsTailCall) { // Since the return is being combined with the call, we need to pass on the @@ -2599,11 +2571,11 @@ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); SDValue ReturnAddrReg = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); + DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), - MVT::i64); - Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag); + PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), MVT::i64); + Chain = + DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag); InFlag = Chain.getValue(1); } @@ -2612,10 +2584,9 @@ // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getTargetConstant(NumBytes, DL, MVT::i32), - DAG.getTargetConstant(0, DL, MVT::i32), - InFlag, DL); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), InFlag, DL); InFlag = Chain.getValue(1); } @@ -2635,13 +2606,13 @@ // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) { - Ops.push_back(DAG.getRegister(RegToPass.first, - RegToPass.second.getValueType())); + Ops.push_back( + DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); } // Add a register mask operand representing the call-preserved registers. - auto *TRI = static_cast(Subtarget->getRegisterInfo()); + auto *TRI = static_cast(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2670,9 +2641,9 @@ } uint64_t CalleePopBytes = NumBytes; - Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32), - DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), - InFlag, DL); + Chain = DAG.getCALLSEQ_END( + Chain, DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -2683,28 +2654,27 @@ IsThisReturn ? 
OutVals[0] : SDValue()); } -unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, +unsigned SITargetLowering::getRegisterByName(const char *RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) - .Case("m0", AMDGPU::M0) - .Case("exec", AMDGPU::EXEC) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Case("flat_scratch", AMDGPU::FLAT_SCR) - .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Default(AMDGPU::NoRegister); + .Case("m0", AMDGPU::M0) + .Case("exec", AMDGPU::EXEC) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Default(AMDGPU::NoRegister); if (Reg == AMDGPU::NoRegister) { - report_fatal_error(Twine("invalid register name \"" - + StringRef(RegName) + "\".")); - + report_fatal_error( + Twine("invalid register name \"" + StringRef(RegName) + "\".")); } if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { - report_fatal_error(Twine("invalid register \"" - + StringRef(RegName) + "\" for subtarget.")); + report_fatal_error(Twine("invalid register \"" + StringRef(RegName) + + "\" for subtarget.")); } switch (Reg) { @@ -2725,14 +2695,15 @@ llvm_unreachable("missing register type checking"); } - report_fatal_error(Twine("invalid type for register \"" - + StringRef(RegName) + "\".")); + report_fatal_error( + Twine("invalid type for register \"" + StringRef(RegName) + "\".")); } // If kill is not the last instruction, split the block so kill is always a // proper terminator. -MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, - MachineBasicBlock *BB) const { +MachineBasicBlock * +SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); MachineBasicBlock::iterator SplitPoint(&MI); @@ -2745,8 +2716,7 @@ } MachineFunction *MF = BB->getParent(); - MachineBasicBlock *SplitBB - = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + MachineBasicBlock *SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); MF->insert(++MachineFunction::iterator(BB), SplitBB); SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); @@ -2763,20 +2733,13 @@ // will only do one iteration. In the worst case, this will loop 64 times. // // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 
-static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( - const SIInstrInfo *TII, - MachineRegisterInfo &MRI, - MachineBasicBlock &OrigBB, - MachineBasicBlock &LoopBB, - const DebugLoc &DL, - const MachineOperand &IdxReg, - unsigned InitReg, - unsigned ResultReg, - unsigned PhiReg, - unsigned InitSaveExecReg, - int Offset, - bool UseGPRIdxMode, - bool IsIndirectSrc) { +static MachineBasicBlock::iterator +emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, + MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, + const DebugLoc &DL, const MachineOperand &IdxReg, + unsigned InitReg, unsigned ResultReg, unsigned PhiReg, + unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineBasicBlock::iterator I = LoopBB.begin(); unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -2785,29 +2748,29 @@ unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) - .addReg(InitReg) - .addMBB(&OrigBB) - .addReg(ResultReg) - .addMBB(&LoopBB); + .addReg(InitReg) + .addMBB(&OrigBB) + .addReg(ResultReg) + .addMBB(&LoopBB); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) - .addReg(InitSaveExecReg) - .addMBB(&OrigBB) - .addReg(NewExec) - .addMBB(&LoopBB); + .addReg(InitSaveExecReg) + .addMBB(&OrigBB) + .addReg(NewExec) + .addMBB(&LoopBB); // Read the next variant <- also loop target. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); // Compare the just read M0 value to all possible Idx values. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) - .addReg(CurrentIdxReg) - .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + .addReg(CurrentIdxReg) + .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); // Update EXEC, save the original EXEC value to VCC. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) - .addReg(CondReg, RegState::Kill); + .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); @@ -2818,40 +2781,39 @@ } else { IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) - .addReg(CurrentIdxReg, RegState::Kill) - .addImm(Offset); + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); } - unsigned IdxMode = IsIndirectSrc ? - VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + unsigned IdxMode = + IsIndirectSrc ? VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; MachineInstr *SetOn = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(IdxReg, RegState::Kill) - .addImm(IdxMode); + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg, RegState::Kill) + .addImm(IdxMode); SetOn->getOperand(3).setIsUndef(); } else { // Move index from VCC into M0 if (Offset == 0) { BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(CurrentIdxReg, RegState::Kill); + .addReg(CurrentIdxReg, RegState::Kill); } else { BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(CurrentIdxReg, RegState::Kill) - .addImm(Offset); + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); } } // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
MachineInstr *InsertPt = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(NewExec); + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use // s_cbranch_scc0? // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&LoopBB); + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); return InsertPt->getIterator(); } @@ -2861,14 +2823,10 @@ // per-workitem, so is kept alive for the whole loop so we end up not re-using a // subregister from it, using 1 more VGPR than necessary. This was saved when // this was expanded after register allocation. -static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, - MachineBasicBlock &MBB, - MachineInstr &MI, - unsigned InitResultReg, - unsigned PhiReg, - int Offset, - bool UseGPRIdxMode, - bool IsIndirectSrc) { +static MachineBasicBlock::iterator +loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, + unsigned InitResultReg, unsigned PhiReg, int Offset, + bool UseGPRIdxMode, bool IsIndirectSrc) { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2882,7 +2840,7 @@ // Save the EXEC mask BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) - .addReg(AMDGPU::EXEC); + .addReg(AMDGPU::EXEC); // To insert the loop we need to split the block. Move everything after this // point to a new block, and insert a new empty block between the two. @@ -2911,7 +2869,7 @@ MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(SaveExec); + .addReg(SaveExec); return InsPt; } @@ -2919,8 +2877,7 @@ // Returns subreg index, offset static std::pair computeIndirectRegAndOffset(const SIRegisterInfo &TRI, - const TargetRegisterClass *SuperRC, - unsigned VecReg, + const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset) { int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; @@ -2934,10 +2891,8 @@ // Return true if the index is an SGPR and was set. static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, - MachineRegisterInfo &MRI, - MachineInstr &MI, - int Offset, - bool UseGPRIdxMode, + MachineRegisterInfo &MRI, MachineInstr &MI, + int Offset, bool UseGPRIdxMode, bool IsIndirectSrc) { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2952,8 +2907,8 @@ return false; if (UseGPRIdxMode) { - unsigned IdxMode = IsIndirectSrc ? - VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + unsigned IdxMode = + IsIndirectSrc ? 
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; if (Offset == 0) { MachineInstr *SetOn = BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) @@ -2967,9 +2922,9 @@ .add(*Idx) .addImm(Offset); MachineInstr *SetOn = - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Tmp, RegState::Kill) - .addImm(IdxMode); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(Tmp, RegState::Kill) + .addImm(IdxMode); SetOn->getOperand(3).setIsUndef(); } @@ -2978,12 +2933,11 @@ } if (Offset == 0) { - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .add(*Idx); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx); } else { BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .add(*Idx) - .addImm(Offset); + .add(*Idx) + .addImm(Offset); } return true; @@ -3005,8 +2959,8 @@ const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); unsigned SubReg; - std::tie(SubReg, Offset) - = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); + std::tie(SubReg, Offset) = + computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); @@ -3019,14 +2973,14 @@ // to avoid interfering with other uses, so probably requires a new // optimization pass. BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit); } MI.eraseFromParent(); @@ -3042,20 +2996,20 @@ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); - auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, - Offset, UseGPRIdxMode, true); + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, + UseGPRIdxMode, true); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit) + .addReg(AMDGPU::M0, RegState::Implicit); BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit); + .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, RegState::Implicit); } MI.eraseFromParent(); @@ -3100,9 +3054,8 @@ assert(Val->getReg()); unsigned SubReg; - std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, - SrcVec->getReg(), - Offset); + std::tie(SubReg, Offset) = + computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); if (Idx->getReg() == AMDGPU::NoRegister) { @@ -3154,8 +3107,8 @@ unsigned PhiReg = MRI.createVirtualRegister(VecRC); - auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, - Offset, UseGPRIdxMode, false); + auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, + UseGPRIdxMode, false); 
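For context on the emitIndirectSrc/emitIndirectDst paths above, a rough model of what V_MOVRELS/V_MOVRELD do once M0 holds the computed index (a sketch over an assumed flat array of 32-bit elements, not the real register-file semantics):

    #include <cstdint>

    // M0 plus the operand's sub-register offset selects which 32-bit element of
    // a VGPR tuple is read (v_movrels) or written (v_movreld).
    static uint32_t movrels_model(const uint32_t Vec[], unsigned M0,
                                  unsigned SubIdx) {
      return Vec[M0 + SubIdx];
    }

    static void movreld_model(uint32_t Vec[], unsigned M0, unsigned SubIdx,
                              uint32_t Val) {
      Vec[M0 + SubIdx] = Val;
    }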
MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -3181,8 +3134,9 @@ return LoopBB; } -MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr &MI, MachineBasicBlock *BB) const { +MachineBasicBlock * +SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); MachineFunction *MF = BB->getParent(); @@ -3211,35 +3165,31 @@ unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) - .add(Src0Sub0) - .add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) - .add(Src0Sub1) - .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); MI.eraseFromParent(); return BB; } @@ -3294,8 +3244,7 @@ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) .addReg(InputReg) .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), - AMDGPU::EXEC) + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), AMDGPU::EXEC) .addReg(CountReg) .addImm(0); BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) @@ -3342,24 +3291,24 @@ unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned SrcCondCopy = + MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) - .addReg(SrcCond); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(Src0, 0, AMDGPU::sub0) - .addReg(Src1, 0, AMDGPU::sub0) - .addReg(SrcCondCopy); + .addReg(Src0, 0, AMDGPU::sub0) + .addReg(Src1, 0, AMDGPU::sub0) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(Src0, 0, AMDGPU::sub1) - .addReg(Src1, 0, AMDGPU::sub1) - .addReg(SrcCondCopy); + .addReg(Src0, 0, AMDGPU::sub1) + .addReg(Src1, 0, AMDGPU::sub1) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) - .addReg(DstLo) - .addImm(AMDGPU::sub0) - .addReg(DstHi) - .addImm(AMDGPU::sub1); + .addReg(DstLo) + .addImm(AMDGPU::sub0) + .addReg(DstHi) + .addImm(AMDGPU::sub1); MI.eraseFromParent(); return BB; } @@ -3401,14 +3350,15 @@ MachineInstrBuilder MIB; if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) - .add(MI.getOperand(0)) - .addGlobalAddress(G); + .add(MI.getOperand(0)) + .addGlobalAddress(G); } else { MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN)) - .add(MI.getOperand(0)) - .addGlobalAddress(G); + .add(MI.getOperand(0)) + .addGlobalAddress(G); - // There is an additional imm operand for tcreturn, but it should be in the + // There is an additional imm operand for tcreturn, but it should be in + // the // right place already. 
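For reference on the S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO expansion above: the LoOpc/HiOpc pair implements the usual split of a 64-bit add into a low 32-bit add that produces a carry and a high 32-bit add-with-carry. A minimal standalone sketch of the same arithmetic (plain C++, not the MIR builder API):

    #include <cstdint>

    // Split a 64-bit addition into s_add_u32 (low) + s_addc_u32 (high),
    // mirroring the REG_SEQUENCE of DestSub0/DestSub1 built above.
    static uint64_t add64_via_32(uint32_t ALo, uint32_t AHi, uint32_t BLo,
                                 uint32_t BHi) {
      uint32_t Lo = ALo + BLo;            // s_add_u32, sets the carry in SCC
      uint32_t Carry = Lo < ALo ? 1 : 0;  // SCC
      uint32_t Hi = AHi + BHi + Carry;    // s_addc_u32 consumes SCC
      return ((uint64_t)Hi << 32) | Lo;   // REG_SEQUENCE sub0/sub1
    }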
} @@ -3510,10 +3460,8 @@ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); SDLoc SL(Op); - SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, - Op->getFlags()); - SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, - Op->getFlags()); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags()); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } @@ -3533,22 +3481,23 @@ SDLoc SL(Op); - SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, - Op->getFlags()); - SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, - Op->getFlags()); + SDValue OpLo = + DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags()); + SDValue OpHi = + DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags()); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::BRCOND: + return LowerBRCOND(Op, DAG); case ISD::LOAD: { SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && + assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && "Load should return a value and a chain"); return Result; } @@ -3556,19 +3505,27 @@ case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::FDIV: return LowerFDIV(Op, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::FDIV: + return LowerFDIV(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: + return LowerATOMIC_CMP_SWAP(Op, DAG); + case ISD::STORE: + return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); return LowerGlobalAddress(MFI, Op, DAG); } - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); - case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); - case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: + return LowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); + case ISD::ADDRSPACECAST: + return lowerADDRSPACECAST(Op, DAG); case ISD::INSERT_VECTOR_ELT: return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: @@ -3605,8 +3562,8 @@ } static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, - const SDLoc &DL, - SelectionDAG &DAG, bool Unpacked) { + const SDLoc &DL, SelectionDAG &DAG, + bool Unpacked) { if (!LoadVT.isVector()) return Result; @@ -3631,8 +3588,7 @@ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); } -SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, - MemSDNode *M, +SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, ArrayRef Ops, bool IsIntrinsic) const { @@ -3643,25 +3599,24 @@ EVT EquivLoadVT = LoadVT; if (Unpacked && LoadVT.isVector()) { - EquivLoadVT = LoadVT.isVector() ? 
- EVT::getVectorVT(*DAG.getContext(), MVT::i32, - LoadVT.getVectorNumElements()) : LoadVT; + EquivLoadVT = LoadVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) + : LoadVT; } // Change from v4f16/v2f16 to EquivLoadVT. SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); - SDValue Load - = DAG.getMemIntrinsicNode( - IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, - VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); + SDValue Load = DAG.getMemIntrinsicNode( + IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops, + M->getMemoryVT(), M->getMemOperand()); if (!Unpacked) // Just adjusted the opcode. return Load; SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); - return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); + return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL); } void SITargetLowering::ReplaceNodeResults(SDNode *N, @@ -3685,8 +3640,8 @@ SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, - Src0, Src1); + SDValue Cvt = + DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1); Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } @@ -3743,8 +3698,8 @@ SelectVT = MVT::i32; } - SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT, - N->getOperand(0), LHS, RHS); + SDValue NewSelect = + DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS); if (NewVT != SelectVT) NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); @@ -3758,8 +3713,7 @@ SDLoc SL(N); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); - SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, - BC, + SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC, DAG.getConstant(0x80008000, SL, MVT::i32)); Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; @@ -3771,8 +3725,7 @@ SDLoc SL(N); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); - SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, - BC, + SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC, DAG.getConstant(0x7fff7fff, SL, MVT::i32)); Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; @@ -3848,14 +3801,16 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + GV->getType()->getAddressSpace() == + AMDGPUASI.CONSTANT_ADDRESS_32BIT) && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + GV->getType()->getAddressSpace() == + AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -3866,8 +3821,7 @@ /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise -SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, - SelectionDAG &DAG) const { +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { SDLoc DL(BRCOND); SDNode *Intr 
= BRCOND.getOperand(1).getNode(); @@ -3891,7 +3845,8 @@ // e.g. llvm.amdgcn.loop // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 - // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch + // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, + // BasicBlock:ch unsigned CFNode = isCFIntrinsic(Intr); if (CFNode == 0) { @@ -3903,16 +3858,16 @@ Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; assert(!SetCC || - (SetCC->getConstantOperandVal(1) == 1 && - cast(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE)); + (SetCC->getConstantOperandVal(1) == 1 && + cast(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE)); // operands of the new intrinsic call SmallVector Ops; if (HaveChain) Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end()); + Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end()); Ops.push_back(Target); ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); @@ -3921,20 +3876,14 @@ SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); if (!HaveChain) { - SDValue Ops[] = { - SDValue(Result, 0), - BRCOND.getOperand(0) - }; + SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)}; Result = DAG.getMergeValues(Ops, DL).getNode(); } if (BR) { // Give the branch instruction our target - SDValue Ops[] = { - BR->getOperand(0), - BRCOND.getOperand(2) - }; + SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)}; SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); BR = NewBR.getNode(); @@ -3948,30 +3897,23 @@ if (!CopyToReg) continue; - Chain = DAG.getCopyToReg( - Chain, DL, - CopyToReg->getOperand(1), - SDValue(Result, i - 1), - SDValue()); + Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1), + SDValue(Result, i - 1), SDValue()); DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); } // Remove the old intrinsic from the chain - DAG.ReplaceAllUsesOfValueWith( - SDValue(Intr, Intr->getNumValues() - 1), - Intr->getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); return Chain; } -SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, - SDValue Op, - const SDLoc &DL, - EVT VT) const { - return Op.getValueType().bitsLE(VT) ? - DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : - DAG.getNode(ISD::FTRUNC, DL, VT, Op); +SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, + const SDLoc &DL, EVT VT) const { + return Op.getValueType().bitsLE(VT) ? 
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) + : DAG.getNode(ISD::FTRUNC, DL, VT, Op); } SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -4002,17 +3944,13 @@ SIMachineFunctionInfo *Info = MF.getInfo(); unsigned UserSGPR = Info->getQueuePtrUserSGPR(); assert(UserSGPR != AMDGPU::NoRegister); - SDValue QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SDValue QueuePtr = + CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); - SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, - QueuePtr, SDValue()); + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); SDValue Ops[] = { - ToReg, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), - SGPR01, - ToReg.getValue(1) - }; + ToReg, DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + SGPR01, ToReg.getValue(1)}; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } @@ -4025,17 +3963,14 @@ !Subtarget->isTrapHandlerEnabled()) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", - Op.getDebugLoc(), - DS_Warning); + Op.getDebugLoc(), DS_Warning); LLVMContext &Ctx = MF.getFunction().getContext(); Ctx.diagnose(NoTrap); return Chain; } - SDValue Ops[] = { - Chain, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) - }; + SDValue Ops[] = {Chain, DAG.getTargetConstant( + GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)}; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } @@ -4043,16 +3978,16 @@ SelectionDAG &DAG) const { // FIXME: Use inline constants (src_{shared, private}_base) instead. if (Subtarget->hasApertureRegs()) { - unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? - AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : - AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; - unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? - AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : - AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; - unsigned Encoding = - AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | - Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | - WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; + unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS + ? AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE + : AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; + unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS + ? AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE + : AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; + unsigned Encoding = AMDGPU::Hwreg::ID_MEM_BASES + << AMDGPU::Hwreg::ID_SHIFT_ | + Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | + WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); SDValue ApertureReg = SDValue( @@ -4066,8 +4001,8 @@ unsigned UserSGPR = Info->getQueuePtrUserSGPR(); assert(UserSGPR != AMDGPU::NoRegister); - SDValue QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SDValue QueuePtr = + CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. @@ -4078,8 +4013,8 @@ // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? 
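The Encoding value built in getSegmentAperture above packs an s_getreg_b32 style immediate out of a hardware-register id, a bit offset, and a width-minus-one field. A hedged sketch of that packing, assuming the usual GCN field layout of id in bits [5:0], offset in bits [10:6], and size-1 in bits [15:11] (the concrete values of the *_SHIFT_ constants are not shown in this patch):

    #include <cstdint>

    // Pack an s_getreg_b32 style immediate; the shift amounts are assumptions
    // standing in for AMDGPU::Hwreg::{ID_SHIFT_, OFFSET_SHIFT_, WIDTH_M1_SHIFT_}.
    static uint16_t packHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      const unsigned IdShift = 0, OffsetShift = 6, WidthM1Shift = 11;
      return (uint16_t)((Id << IdShift) | (Offset << OffsetShift) |
                        ((Width - 1) << WidthM1Shift));
    }

    // e.g. reading a 32-bit aperture base: packHwreg(IdMemBases, Offset, 32)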
- Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), - AMDGPUASI.CONSTANT_ADDRESS)); + Value *V = UndefValue::get(PointerType::get( + Type::getInt8Ty(*DAG.getContext()), AMDGPUASI.CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, @@ -4097,7 +4032,7 @@ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); const AMDGPUTargetMachine &TM = - static_cast(getTargetMachine()); + static_cast(getTargetMachine()); // flat -> local/private if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { @@ -4110,8 +4045,8 @@ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); - return DAG.getNode(ISD::SELECT, SL, MVT::i32, - NonNull, Ptr, SegmentNullPtr); + return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, + SegmentNullPtr); } } @@ -4124,12 +4059,12 @@ unsigned NullVal = TM.getNullPointerValue(SrcAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); - SDValue NonNull - = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); + SDValue NonNull = + DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); - SDValue CvtPtr - = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + SDValue CvtPtr = + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), @@ -4141,7 +4076,7 @@ const MachineFunction &MF = DAG.getMachineFunction(); DiagnosticInfoUnsupported InvalidAddrSpaceCast( - MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); DAG.getContext()->diagnose(InvalidAddrSpaceCast); return DAG.getUNDEF(ASC->getValueType(0)); @@ -4157,7 +4092,6 @@ unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); - assert(VecSize <= 64); unsigned NumElts = VecVT.getVectorNumElements(); @@ -4177,16 +4111,16 @@ unsigned Idx = KIdx->getZExtValue(); bool InsertLo = Idx < 2; - SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, - InsertLo ? LoVec : HiVec, - DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), - DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); + SDValue InsHalf = DAG.getNode( + ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec, + DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), + DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); - SDValue Concat = InsertLo ? - DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) : - DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf }); + SDValue Concat = + InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf}) + : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf}); return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); } @@ -4199,7 +4133,7 @@ // Avoid stack access for dynamic indexing. 
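To make the addrspacecast lowering above concrete: flat to local/private keeps the low 32 bits of non-null pointers, and local/private to flat rebuilds a 64-bit pointer from the 32-bit offset plus the aperture high half, with each space's null value handled by a select. A minimal sketch, assuming the segment null value is all-ones (what TM.getNullPointerValue is expected to return here):

    #include <cstdint>

    static const uint64_t FlatNull = 0;       // null in the flat address space
    static const uint32_t SegmentNull = ~0u;  // assumed null for local/private

    // flat -> local/private: non-null pointers keep their low 32 bits.
    static uint32_t flatToSegment(uint64_t Flat) {
      return Flat != FlatNull ? (uint32_t)Flat : SegmentNull;
    }

    // local/private -> flat: glue the 32-bit offset under the aperture base hi,
    // which is the BUILD_VECTOR + BITCAST pair in the code above.
    static uint64_t segmentToFlat(uint32_t Seg, uint32_t ApertureHi) {
      if (Seg == SegmentNull)
        return FlatNull;
      return ((uint64_t)ApertureHi << 32) | Seg;
    }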
SDValue Val = InsVal; if (InsVal.getValueType() == MVT::f16) - Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); + Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); @@ -4212,12 +4146,11 @@ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, - DAG.getConstant(0xffff, SL, IntVT), - ScaledIdx); + DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); - SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, - DAG.getNOT(SL, BFM, IntVT), BCVec); + SDValue RHS = + DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); @@ -4274,15 +4207,15 @@ // Turn into pair of packed build_vectors. // TODO: Special case for constants that can be materialized with s_mov_b64. - SDValue Lo = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(0), Op.getOperand(1) }); - SDValue Hi = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(2), Op.getOperand(3) }); + SDValue Lo = + DAG.getBuildVector(HalfVT, SL, {Op.getOperand(0), Op.getOperand(1)}); + SDValue Hi = + DAG.getBuildVector(HalfVT, SL, {Op.getOperand(2), Op.getOperand(3)}); SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); - SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, {CastLo, CastHi}); return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } @@ -4314,8 +4247,8 @@ return DAG.getNode(ISD::BITCAST, SL, VT, Or); } -bool -SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { +bool SITargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || @@ -4357,11 +4290,11 @@ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags); - SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags == SIInstrInfo::MO_NONE ? - GAFlags : GAFlags + 1); + SDValue PtrLo = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); + SDValue PtrHi = DAG.getTargetGlobalAddress( + GV, DL, MVT::i32, Offset + 4, + GAFlags == SIInstrInfo::MO_NONE ? GAFlags : GAFlags + 1); return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -4419,13 +4352,12 @@ return SDValue(M0, 0); } -SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, - SDValue Op, +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const { SDLoc SL(Op); - SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, 4, false); + SDValue Param = lowerKernargMemParameter( + DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, 4, false); // The local size values will have the hi 16-bits as zero. 
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -4513,6 +4445,7 @@
                                      SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
@@ -4522,6 +4455,7 @@
   SmallVector ResultTypes(Op->value_begin(), Op->value_end());
   bool IsD16 = false;
+  bool IsA16 = false;
   SDValue VData;
   int NumVDataDwords;
   unsigned AddrIdx; // Index of first address argument
@@ -4597,25 +4531,63 @@
     }
   }

-  unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
-    (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
-    (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
-    (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+  unsigned NumVAddrs =
+      BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+  unsigned NumMIVAddrs = NumVAddrs;
+
   SmallVector VAddrs;
-  for (unsigned i = 0; i < NumVAddrs; ++i)
-    VAddrs.push_back(Op.getOperand(AddrIdx + i));

-  // Optimize _L to _LZ when _L is zero
+  // Optimize _L to _LZ when _L is zero.
   if (LZMappingInfo) {
-    if (auto ConstantLod =
-         dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+    if (auto ConstantLod = dyn_cast<ConstantFPSDNode>(
+            Op.getOperand(AddrIdx + NumVAddrs - 1))) {
       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
-        IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
-        VAddrs.pop_back();               // remove 'lod'
+        IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+        NumMIVAddrs--;                  // remove lod
       }
     }
   }

+  // Check for 16 bit addresses and pack if true.
+  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
+  if (VAddrVT.getScalarType() == MVT::f16 &&
+      ST->hasFeature(AMDGPU::FeatureR128A16)) {
+    IsA16 = true;
+    for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
+      SDValue AddrLo, AddrHi;
+      // Push back extra arguments.
+      if (i < DimIdx) {
+        AddrLo = Op.getOperand(i);
+      } else {
+        AddrLo = Op.getOperand(i);
+        // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
+        // in 1D, derivatives dx/dh and dx/dv are packed with undef.
+        if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
+            ((NumGradients / 2) % 2 == 1 &&
+             (i == DimIdx + (NumGradients / 2) - 1 ||
+              i == DimIdx + NumGradients - 1))) {
+          AddrHi = DAG.getUNDEF(MVT::f16);
+        } else {
+          AddrHi = Op.getOperand(i + 1);
+          i++;
+        }
+        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
+                             {AddrLo, AddrHi});
+        AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
+      }
+      VAddrs.push_back(AddrLo);
+    }
+  } else {
+    for (unsigned i = 0; i < NumMIVAddrs; ++i)
+      VAddrs.push_back(Op.getOperand(AddrIdx + i));
+  }
+
   SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);

   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4630,16 +4602,13 @@
       dyn_cast(Op.getOperand(AddrIdx + NumVAddrs + 2));
     if (!UnormConst)
       return Op;
-
     Unorm = UnormConst->getZExtValue() ?
True : False; CtrlIdx = AddrIdx + NumVAddrs + 3; } - SDValue TexFail = Op.getOperand(CtrlIdx); auto TexFailConst = dyn_cast(TexFail.getNode()); if (!TexFailConst || TexFailConst->getZExtValue() != 0) return Op; - SDValue GLC; SDValue SLC; if (BaseOpcode->Atomic) { @@ -4650,7 +4619,6 @@ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) return Op; } - SmallVector Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata @@ -4662,7 +4630,10 @@ Ops.push_back(Unorm); Ops.push_back(GLC); Ops.push_back(SLC); - Ops.push_back(False); // r128 + Ops.push_back(IsA16 && // r128 or a16 + ST->hasFeature(AMDGPU::FeatureR128A16) + ? True + : False); Ops.push_back(False); // tfe Ops.push_back(False); // lwe Ops.push_back(DimInfo->DA ? True : False); @@ -4731,8 +4702,9 @@ return DAG.getUNDEF(VT); } - auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? - AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; + auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr + ? AMDGPUFunctionArgInfo::DISPATCH_PTR + : AMDGPUFunctionArgInfo::QUEUE_PTR; return getPreloadedValue(DAG, *MFI, VT, RegID); } case Intrinsic::amdgcn_implicitarg_ptr: { @@ -4770,8 +4742,8 @@ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); + SDValue Tmp = + DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT)); return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, DAG.getConstantFP(Min, DL, VT)); } @@ -4780,37 +4752,43 @@ return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, 4, false); + SI::KernelInputOffsets::NGROUPS_X, 4, + false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, 4, false); + SI::KernelInputOffsets::NGROUPS_Y, 4, + false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, 4, false); + SI::KernelInputOffsets::NGROUPS_Z, 4, + false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, + false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, + false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, + false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -4858,10 +4836,7 @@ SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - 
Op.getOperand(1), - Op.getOperand(2) - }; + SDValue Ops[] = {Op.getOperand(1), Op.getOperand(2)}; MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), @@ -4902,39 +4877,38 @@ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); - DiagnosticInfoUnsupported BadIntrin( - MF.getFunction(), "intrinsic not supported on subtarget", - DL.getDebugLoc()); - DAG.getContext()->diagnose(BadIntrin); - return DAG.getUNDEF(VT); + DiagnosticInfoUnsupported BadIntrin(MF.getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); } case Intrinsic::amdgcn_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case Intrinsic::amdgcn_fract: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_class: - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case Intrinsic::amdgcn_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case Intrinsic::amdgcn_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { // 3rd parameter required to be a constant. const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); if (!Param) - return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); + return DAG.getMergeValues({DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1)}, DL); // Translate to the operands expected by the machine instruction. The // first parameter must be the same as the first instruction. 
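Looking back at the FeatureR128A16 change in lowerImage above: with a16 set, 16-bit address components are packed two per dword, and an odd trailing element in each group (dz/dh, dz/dv, or the last coordinate) gets an undef high half. A rough standalone sketch of that grouping rule, assuming the operands arrive as a dh-gradient group, a dv-gradient group, and then coordinates plus lod/clamp packed contiguously:

    #include <cstdint>
    #include <vector>

    // Pack 16-bit image address components two per dword the way the a16 path
    // above does. Each group pads an odd trailing element with an undef high
    // half (zero here, just for the sketch).
    static std::vector<uint32_t>
    packA16(const std::vector<std::vector<uint16_t>> &Groups) {
      std::vector<uint32_t> Dwords;
      for (const auto &Group : Groups) {
        for (size_t I = 0; I < Group.size(); I += 2) {
          uint32_t Lo = Group[I];
          uint32_t Hi = (I + 1 < Group.size()) ? Group[I + 1] : 0; // undef pad
          Dwords.push_back(Lo | (Hi << 16));
        }
      }
      return Dwords;
    }

    // e.g. a 3D sample_d: {{dxdh,dydh,dzdh},{dxdv,dydv,dzdv},{x,y,z}} packs to
    // 6 dwords, with dzdh, dzdv and z each occupying a dword's low half.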
@@ -4982,23 +4956,22 @@ Op.getOperand(2), DAG.getCondCode(CCOpcode)); } case Intrinsic::amdgcn_fmed3: - return DAG.getNode(AMDGPUISD::FMED3, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fdot2: - return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); + return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case Intrinsic::amdgcn_fmul_legacy: - return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case Intrinsic::amdgcn_sffbh: return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_sbfe: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_ubfe: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_cvt_pkrtz: case Intrinsic::amdgcn_cvt_pknorm_i16: case Intrinsic::amdgcn_cvt_pknorm_u16: @@ -5022,8 +4995,8 @@ if (isTypeLegal(VT)) return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2)); - SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, - Op.getOperand(1), Op.getOperand(2)); + SDValue Node = + DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } case Intrinsic::amdgcn_wqm: { @@ -5081,9 +5054,9 @@ llvm_unreachable("Unknown intrinsic!"); } SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value }; return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, @@ -5092,24 +5065,25 @@ case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: { SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // offset - Op.getOperand(5), // glc - Op.getOperand(6) // slc + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // offset + Op.getOperand(5), // glc + Op.getOperand(6) // slc }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) + ? 
AMDGPUISD::BUFFER_LOAD + : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast(Op); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, + Ops); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } @@ -5118,21 +5092,21 @@ EVT LoadVT = Op.getValueType(); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - Op.getOperand(7), // dfmt - Op.getOperand(8), // nfmt - Op.getOperand(9), // glc - Op.getOperand(10) // slc + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc }; if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, + Ops); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, Op->getVTList(), Ops, LoadVT, M->getMemOperand()); @@ -5148,12 +5122,12 @@ case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_buffer_atomic_xor: { SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // offset - Op.getOperand(6) // slc + Op.getOperand(0), // Chain + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // offset + Op.getOperand(6) // slc }; EVT VT = Op.getValueType(); @@ -5201,19 +5175,20 @@ case Intrinsic::amdgcn_buffer_atomic_cmpswap: { SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - Op.getOperand(6), // offset - Op.getOperand(7) // slc + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Op.getOperand(4), // rsrc + Op.getOperand(5), // vindex + Op.getOperand(6), // offset + Op.getOperand(7) // slc }; EVT VT = Op.getValueType(); auto *M = cast(Op); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); + Op->getVTList(), Ops, VT, + M->getMemOperand()); } default: @@ -5266,19 +5241,18 @@ const ConstantSDNode *VM = cast(Op.getOperand(9)); const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - Op.getOperand(4), // src0 - Op.getOperand(5), // src1 - Op.getOperand(6), // src2 - Op.getOperand(7), // src3 - DAG.getTargetConstant(0, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) - }; - - unsigned Opc = Done->isNullValue() ? 
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + Op.getOperand(4), // src0 + Op.getOperand(5), // src1 + Op.getOperand(6), // src2 + Op.getOperand(7), // src3 + DAG.getTargetConstant(0, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)}; + + unsigned Opc = + Done->isNullValue() ? AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } case Intrinsic::amdgcn_exp_compr: { @@ -5291,29 +5265,28 @@ SDValue Undef = DAG.getUNDEF(MVT::f32); const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), - Undef, // src2 - Undef, // src3 - DAG.getTargetConstant(1, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) - }; - - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; + Chain, + DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt + DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), + Undef, // src2 + Undef, // src3 + DAG.getTargetConstant(1, DL, MVT::i1), // compr + DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)}; + + unsigned Opc = + Done->isNullValue() ? AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { - unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? - AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; + unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) + ? AMDGPUISD::SENDMSG + : AMDGPUISD::SENDMSGHALT; Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); + return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } case Intrinsic::amdgcn_init_exec: { return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, @@ -5342,7 +5315,8 @@ unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, - Op.getOperand(0)), 0); + Op.getOperand(0)), + 0); } return SDValue(); }; @@ -5355,29 +5329,31 @@ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); - assert(!(OffEn->isOne() && IdxEn->isOne()) && - "Legacy intrinsic doesn't support both offset and index - use new version"); + assert(!(OffEn->isOne() && IdxEn->isOne()) && "Legacy intrinsic doesn't " + "support both offset and " + "index - use new version"); SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; SDValue VOffset = OffEn->isOne() ? VAddr : Zero; // Deal with the vec-3 case const ConstantSDNode *NumChannels = cast(Op.getOperand(4)); - auto Opcode = NumChannels->getZExtValue() == 3 ? - AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + auto Opcode = NumChannels->getZExtValue() == 3 + ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_X3 + : AMDGPUISD::TBUFFER_STORE_FORMAT; SDValue Ops[] = { - Chain, - Op.getOperand(3), // vdata - Op.getOperand(2), // rsrc - VIndex, - VOffset, - Op.getOperand(6), // soffset - Op.getOperand(7), // inst_offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(12), // glc - Op.getOperand(13), // slc + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc }; assert((cast(Op.getOperand(14)))->getZExtValue() == 0 && @@ -5385,11 +5361,8 @@ EVT VT = Op.getOperand(3).getValueType(); MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(Opcode, DL, - Op->getVTList(), Ops, VT, MMO); + MachinePointerInfo(), MachineMemOperand::MOStore, VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); } case Intrinsic::amdgcn_tbuffer_store: { @@ -5398,20 +5371,20 @@ if (IsD16) VData = handleD16VData(VData, DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // voffset - Op.getOperand(6), // soffset - Op.getOperand(7), // offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(10), // glc - Op.getOperand(11) // slc + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc }; - unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : - AMDGPUISD::TBUFFER_STORE_FORMAT; + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 + : AMDGPUISD::TBUFFER_STORE_FORMAT; MemSDNode *M = cast(Op); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); @@ -5424,16 +5397,17 @@ if (IsD16) VData = handleD16VData(VData, DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // offset - Op.getOperand(6), // glc - Op.getOperand(7) // slc + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // offset + Op.getOperand(6), // glc + Op.getOperand(7) // slc }; - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store + ? AMDGPUISD::BUFFER_STORE + : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast(Op); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, @@ -5449,9 +5423,8 @@ } } -static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, - ISD::LoadExtType ExtType, SDValue Op, - const SDLoc &SL, EVT VT) { +static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, + SDValue Op, const SDLoc &SL, EVT VT) { if (VT.bitsLT(Op.getValueType())) return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); @@ -5469,7 +5442,8 @@ llvm_unreachable("invalid ext type"); } -SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { +SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; if (Ld->getAlignment() < 4 || Ld->isDivergent()) return SDValue(); @@ -5496,14 +5470,11 @@ // TODO: Drop only high part of range. SDValue Ptr = Ld->getBasePtr(); - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - MVT::i32, SL, Ld->getChain(), Ptr, - Ld->getOffset(), - Ld->getPointerInfo(), MVT::i32, - Ld->getAlignment(), - Ld->getMemOperand()->getFlags(), - Ld->getAAInfo(), - nullptr); // Drop ranges + SDValue NewLoad = DAG.getLoad( + ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlignment(), + Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), + nullptr); // Drop ranges EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); if (MemVT.isFloatingPoint()) { @@ -5536,7 +5507,7 @@ // Handle conversion back to floating point if necessary. Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); - return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); + return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL); } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -5558,13 +5529,11 @@ EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, RealMemVT, MMO); + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, + RealMemVT, MMO); - SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), - NewLD.getValue(1) - }; + SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1)}; return DAG.getMergeValues(Ops, DL); } @@ -5577,8 +5546,8 @@ unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Alignment)) { + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, AS, + Alignment)) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -5589,8 +5558,8 @@ // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUASI.FLAT_ADDRESS) - AS = MFI->hasFlatScratchInit() ? - AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; + AS = MFI->hasFlatScratchInit() ? AMDGPUASI.PRIVATE_ADDRESS + : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); @@ -5618,8 +5587,7 @@ } if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || - AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS) { + AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); // v4 loads are supported for private and global memory. 
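On widenLoad above: uniform sub-dword loads from the constant address space are widened to an i32 load and truncated back. A tiny model of the equivalent memory transformation, assuming 4-byte alignment and a little-endian layout so the requested value sits in the low bits of the widened word:

    #include <cstdint>
    #include <cstring>

    // Load a 16-bit value by performing an aligned 32-bit load and truncating,
    // mirroring the i32 load + ISD::TRUNCATE pair that widenLoad emits.
    static uint16_t widenedLoad16(const void *Ptr4ByteAligned) {
      uint32_t Word;
      std::memcpy(&Word, Ptr4ByteAligned, sizeof(Word)); // the widened i32 load
      return (uint16_t)Word;                             // ISD::TRUNCATE
    }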
@@ -5692,7 +5660,8 @@ SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); + bool Unsafe = + DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) return SDValue(); @@ -5747,7 +5716,8 @@ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); switch (Opcode) { - default: llvm_unreachable("no chain equivalent for opcode"); + default: + llvm_unreachable("no chain equivalent for opcode"); case ISD::FMUL: Opcode = AMDGPUISD::FMUL_W_CHAIN; break; @@ -5768,7 +5738,8 @@ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); switch (Opcode) { - default: llvm_unreachable("no chain equivalent for opcode"); + default: + llvm_unreachable("no chain equivalent for opcode"); case ISD::FMA: Opcode = AMDGPUISD::FMA_W_CHAIN; break; @@ -5793,7 +5764,8 @@ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); - SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); + SDValue BestQuot = + DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); } @@ -5815,7 +5787,7 @@ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); @@ -5844,16 +5816,16 @@ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - RHS, RHS, LHS); - SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - LHS, RHS, LHS); + SDValue DenominatorScaled = + DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = + DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); // Denominator is scaled to not be denormal, so using rcp is ok. 
- SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, - DenominatorScaled); - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, - DenominatorScaled); + SDValue ApproxRcp = + DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); + SDValue NegDivScale0 = + DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | @@ -5863,16 +5835,13 @@ if (!Subtarget->hasFP32Denormals()) { SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, - SL, MVT::i32); - SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), - EnableDenormValue, BitField); - SDValue Ops[3] = { - NegDivScale0, - EnableDenorm.getValue(0), - EnableDenorm.getValue(1) - }; + const SDValue EnableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); + SDValue EnableDenorm = + DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, DAG.getEntryNode(), + EnableDenormValue, BitField); + SDValue Ops[3] = {NegDivScale0, EnableDenorm.getValue(0), + EnableDenorm.getValue(1)}; NegDivScale0 = DAG.getMergeValues(Ops, SL); } @@ -5883,13 +5852,14 @@ SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp, Fma0); - SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, - Fma1, Fma1); + SDValue Mul = + getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1, Fma1); SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled, Mul); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = + getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3); @@ -5897,11 +5867,9 @@ if (!Subtarget->hasFP32Denormals()) { const SDValue DisableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), - DisableDenormValue, - BitField, - Fma4.getValue(2)); + SDValue DisableDenorm = + DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, Fma4.getValue(1), + DisableDenormValue, BitField, Fma4.getValue(2)); SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot()); @@ -5909,8 +5877,8 @@ } SDValue Scale = NumeratorScaled.getValue(1); - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, - Fma4, Fma1, Fma3, Scale); + SDValue Fmas = + DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } @@ -5944,8 +5912,8 @@ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, - NegDivScale0, Mul, DivScale1); + SDValue Fma4 = + DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1); SDValue Scale; @@ -5961,13 +5929,15 @@ SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); - SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); - SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + SDValue NumHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = + 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); - SDValue Scale0Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); - SDValue Scale1Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + SDValue Scale0Hi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); @@ -5976,8 +5946,8 @@ Scale = DivScale1.getValue(1); } - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, - Fma4, Fma3, Mul, Scale); + SDValue Fmas = + DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); } @@ -6003,17 +5973,18 @@ EVT VT = Store->getMemoryVT(); if (VT == MVT::i1) { - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + return DAG.getTruncStore( + Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); unsigned AS = Store->getAddressSpace(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AS, Store->getAlignment())) { + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, AS, + Store->getAlignment())) { return expandUnalignedStore(Store, DAG); } @@ -6022,12 +5993,11 @@ // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUASI.FLAT_ADDRESS) - AS = MFI->hasFlatScratchInit() ? - AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; + AS = MFI->hasFlatScratchInit() ? AMDGPUASI.PRIVATE_ADDRESS + : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = VT.getVectorNumElements(); - if (AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS) { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); return SDValue(); @@ -6065,10 +6035,10 @@ EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); // TODO: Should this propagate fast-math-flags? 
- SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.5/M_PI, DL, - VT))); + SDValue FractPart = + DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5 / M_PI, DL, VT))); switch (Op.getOpcode()) { case ISD::FCOS: @@ -6080,7 +6050,8 @@ } } -SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, + SelectionDAG &DAG) const { AtomicSDNode *AtomicNode = cast(Op); assert(AtomicNode->isCompareAndSwap()); unsigned AS = AtomicNode->getAddressSpace(); @@ -6101,18 +6072,20 @@ MVT VecType = MVT::getVectorVT(SimpleVT, 2); SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); - SDValue Ops[] = { ChainIn, Addr, NewOld }; + SDValue Ops[] = {ChainIn, Addr, NewOld}; - return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), - Ops, VT, AtomicNode->getMemOperand()); + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, + Op->getVTList(), Ops, VT, + AtomicNode->getMemOperand()); } //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// -SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); if (ScalarVT != MVT::f32) @@ -6153,8 +6126,7 @@ // operand with the add of new constant offset. This eliminates one of the uses, // and may allow the remaining use to also be simplified. // -SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, - unsigned AddrSpace, +SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace, EVT MemVT, DAGCombinerInfo &DCI) const { SDValue N0 = N->getOperand(0); @@ -6193,9 +6165,9 @@ SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); SDNodeFlags Flags; - Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && - (N0.getOpcode() == ISD::OR || - N0->getFlags().hasNoUnsignedWrap())); + Flags.setNoUnsignedWrap( + N->getFlags().hasNoUnsignedWrap() && + (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap())); return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); } @@ -6208,7 +6180,7 @@ // TODO: We could also do this for multiplies. if (Ptr.getOpcode() == ISD::SHL) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), N->getMemoryVT(), DCI); if (NewPtr) { SmallVector NewOps(N->op_begin(), N->op_end()); @@ -6233,18 +6205,16 @@ // this way. TODO: We won't want this for SALU especially if it is an inline // immediate. 
SDValue SITargetLowering::splitBinaryBitConstantOp( - DAGCombinerInfo &DCI, - const SDLoc &SL, - unsigned Opc, SDValue LHS, - const ConstantSDNode *CRHS) const { + DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, + const ConstantSDNode *CRHS) const { uint64_t Val = CRHS->getZExtValue(); uint32_t ValLo = Lo_32(Val); uint32_t ValHi = Hi_32(Val); const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - if ((bitOpWithConstantIsReducible(Opc, ValLo) || - bitOpWithConstantIsReducible(Opc, ValHi)) || - (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + if ((bitOpWithConstantIsReducible(Opc, ValLo) || + bitOpWithConstantIsReducible(Opc, ValHi)) || + (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { // If we need to materialize a 64-bit immediate, it will be split up later // anyway. Avoid creating the harder to understand 64-bit immediate // materialization. @@ -6260,7 +6230,8 @@ if (V.getValueType() != MVT::i1) return false; switch (V.getOpcode()) { - default: break; + default: + break; case ISD::SETCC: case ISD::AND: case ISD::OR: @@ -6276,10 +6247,14 @@ static uint32_t getConstantPermuteMask(uint32_t C) { // 0xff for any zero byte in the mask uint32_t ZeroByteMask = 0; - if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; - if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; - if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; - if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; + if (!(C & 0x000000ff)) + ZeroByteMask |= 0x000000ff; + if (!(C & 0x0000ff00)) + ZeroByteMask |= 0x0000ff00; + if (!(C & 0x00ff0000)) + ZeroByteMask |= 0x00ff0000; + if (!(C & 0xff000000)) + ZeroByteMask |= 0xff000000; uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte if ((NonZeroByteMask & C) != NonZeroByteMask) return 0; // Partial bytes selected. @@ -6346,11 +6321,10 @@ SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - const ConstantSDNode *CRHS = dyn_cast(RHS); if (VT == MVT::i64 && CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + if (SDValue Split = + splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) return Split; } @@ -6369,10 +6343,10 @@ unsigned Offset = NB + Shift; if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. 
SDLoc SL(N); - SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - LHS->getOperand(0), - DAG.getConstant(Offset, SL, MVT::i32), - DAG.getConstant(Bits, SL, MVT::i32)); + SDValue BFE = + DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, DAG.getValueType(NarrowVT)); @@ -6414,26 +6388,25 @@ return SDValue(); if (RCC == ISD::SETUNE) { - const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); + const ConstantFPSDNode *C1 = + dyn_cast(RHS.getOperand(1)); if (!C1 || !C1->isInfinity() || C1->isNegative()) return SDValue(); const uint32_t Mask = SIInstrFlags::N_NORMAL | - SIInstrFlags::N_SUBNORMAL | - SIInstrFlags::N_ZERO | - SIInstrFlags::P_ZERO | - SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL | SIInstrFlags::P_NORMAL; - static_assert(((~(SIInstrFlags::S_NAN | - SIInstrFlags::Q_NAN | - SIInstrFlags::N_INFINITY | - SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, - "mask not equal"); + static_assert( + ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) & + 0x3ff) == Mask, + "mask not equal"); SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - X, DAG.getConstant(Mask, DL, MVT::i32)); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X, + DAG.getConstant(Mask, DL, MVT::i32)); } } } @@ -6444,16 +6417,17 @@ if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && RHS.hasOneUse()) { ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); - // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan) - // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan) + // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | + // n_nan) + // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | + // n_nan) const ConstantSDNode *Mask = dyn_cast(RHS.getOperand(1)); if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && (RHS.getOperand(0) == LHS.getOperand(0) && LHS.getOperand(0) == LHS.getOperand(1))) { const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; - unsigned NewMask = LCC == ISD::SETO ? - Mask->getZExtValue() & ~OrdMask : - Mask->getZExtValue() & OrdMask; + unsigned NewMask = LCC == ISD::SETO ? 
Mask->getZExtValue() & ~OrdMask + : Mask->getZExtValue() & OrdMask; SDLoc DL(N); return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0), @@ -6461,14 +6435,14 @@ } } - if (VT == MVT::i32 && - (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { + if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { // and x, (sext cc from i1) => select cc, x, 0 if (RHS.getOpcode() != ISD::SIGN_EXTEND) std::swap(LHS, RHS); if (isBoolSGPR(RHS.getOperand(0))) - return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), - LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); + return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS, + DAG.getConstant(0, SDLoc(N), MVT::i32)); } // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) @@ -6512,8 +6486,8 @@ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, - LHS.getOperand(0), RHS.getOperand(0), + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + RHS.getOperand(0), DAG.getConstant(Sel, DL, MVT::i32)); } } @@ -6545,10 +6519,11 @@ // Only 10 bits are used. static const uint32_t MaxMask = 0x3ff; - uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + uint32_t NewMask = + (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - Src, DAG.getConstant(NewMask, DL, MVT::i32)); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src, + DAG.getConstant(NewMask, DL, MVT::i32)); } return SDValue(); @@ -6601,8 +6576,8 @@ uint32_t Sel = LHSMask | RHSMask; SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, - LHS.getOperand(0), RHS.getOperand(0), + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + RHS.getOperand(0), DAG.getConstant(Sel, DL, MVT::i32)); } } @@ -6632,16 +6607,16 @@ DCI.AddToWorklist(LowOr.getNode()); DCI.AddToWorklist(HiBits.getNode()); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - LowOr, HiBits); + SDValue Vec = + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); } } const ConstantSDNode *CRHS = dyn_cast(N->getOperand(1)); if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) + if (SDValue Split = + splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) return Split; } @@ -6659,8 +6634,8 @@ const ConstantSDNode *CRHS = dyn_cast(RHS); if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) + if (SDValue Split = + splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) return Split; } @@ -6684,8 +6659,8 @@ case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::FABS: - // Fabs is lowered to a bit operation, but it's an and which will clear the - // high bits anyway. + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: @@ -6909,8 +6884,8 @@ return false; case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID - = cast(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = + cast(Op.getOperand(0))->getZExtValue(); // TODO: Handle more intrinsics switch (IntrinsicID) { case Intrinsic::amdgcn_cvt_pkrtz: @@ -6934,8 +6909,9 @@ // Constant fold canonicalize. 
-SDValue SITargetLowering::getCanonicalConstantFP( - SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { +SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG, + const SDLoc &SL, EVT VT, + const APFloat &C) const { // Flush denormals to 0 if not enabled. if (C.isDenormal() && !denormalsEnabledForType(VT)) return DAG.getConstantFP(0.0, SL, VT); @@ -6964,9 +6940,9 @@ return Op.isUndef() || isa(Op); } -SDValue SITargetLowering::performFCanonicalizeCombine( - SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performFCanonicalizeCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7001,8 +6977,8 @@ for (unsigned I = 0; I != 2; ++I) { SDValue Op = N0.getOperand(I); if (ConstantFPSDNode *CFP = dyn_cast(Op)) { - NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, - CFP->getValueAPF()); + NewElts[I] = + getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF()); } else if (Op.isUndef()) { // Handled below based on what the other operand is. NewElts[I] = Op; @@ -7016,13 +6992,15 @@ // cheaper to use and may be free with a packed operation. if (NewElts[0].isUndef()) { if (isa(NewElts[1])) - NewElts[0] = isa(NewElts[1]) ? - NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT); + NewElts[0] = isa(NewElts[1]) + ? NewElts[1] + : DAG.getConstantFP(0.0f, SL, EltVT); } if (NewElts[1].isUndef()) { - NewElts[1] = isa(NewElts[0]) ? - NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT); + NewElts[1] = isa(NewElts[0]) + ? NewElts[0] + : DAG.getConstantFP(0.0f, SL, EltVT); } return DAG.getBuildVector(VT, SL, NewElts); @@ -7051,9 +7029,10 @@ } } -SDValue SITargetLowering::performIntMed3ImmCombine( - SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const { +SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, SDValue Op0, + SDValue Op1, + bool Signed) const { ConstantSDNode *K1 = dyn_cast(Op1); if (!K1) return SDValue(); @@ -7073,8 +7052,8 @@ EVT VT = K0->getValueType(0); unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { - return DAG.getNode(Med3Opc, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + return DAG.getNode(Med3Opc, SL, VT, Op0.getOperand(0), SDValue(K0, 0), + SDValue(K1, 0)); } // If there isn't a 16-bit med3 operation, convert to 32-bit. @@ -7102,8 +7081,7 @@ } SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, - const SDLoc &SL, - SDValue Op0, + const SDLoc &SL, SDValue Op0, SDValue Op1) const { ConstantFPSDNode *K1 = getSplatConstantFP(Op1); if (!K1) @@ -7138,8 +7116,8 @@ if (!DAG.isKnownNeverSNaN(Var)) return SDValue(); - return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), - Var, SDValue(K0, 0), SDValue(K1, 0)); + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var, + SDValue(K0, 0), SDValue(K1, 0)); } return SDValue(); @@ -7157,7 +7135,6 @@ // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. 
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && !VT.isVector() && VT != MVT::f64 && ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { @@ -7165,12 +7142,8 @@ // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), + Op0.getOperand(0), Op0.getOperand(1), Op1); } // Try commuted. @@ -7178,12 +7151,8 @@ // min(a, min(b, c)) -> min3(a, b, c) if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), + Op0, Op1.getOperand(0), Op1.getOperand(1)); } } @@ -7276,21 +7245,22 @@ return SDValue(); } -SDValue SITargetLowering::performExtractVectorEltCombine( - SDNode *N, DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performExtractVectorEltCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); SelectionDAG &DAG = DCI.DAG; EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); - if ((Vec.getOpcode() == ISD::FNEG || - Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { + if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && + allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Vec.getOperand(0), Idx); + SDValue Elt = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt); } @@ -7305,10 +7275,10 @@ SDValue Idx = N->getOperand(1); unsigned Opc = Vec.getOpcode(); - switch(Opc) { + switch (Opc) { default: return SDValue(); - // TODO: Support other binary operations. + // TODO: Support other binary operations. case ISD::FADD: case ISD::ADD: case ISD::UMIN: @@ -7335,11 +7305,8 @@ // elements. This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. 
auto *Idx = dyn_cast(N->getOperand(1)); - if (EltSize <= 16 && - EltVT.isByteSized() && - VecSize > 32 && - VecSize % 32 == 0 && - Idx) { + if (EltSize <= 16 && EltVT.isByteSized() && VecSize > 32 && + VecSize % 32 == 0 && Idx) { EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); unsigned BitIndex = Idx->getZExtValue() * EltSize; @@ -7357,7 +7324,8 @@ DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); DCI.AddToWorklist(Srl.getNode()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); + SDValue Trunc = + DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); DCI.AddToWorklist(Trunc.getNode()); return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc); } @@ -7365,8 +7333,8 @@ return SDValue(); } -static bool convertBuildVectorCastElt(SelectionDAG &DAG, - SDValue &Lo, SDValue &Hi) { +static bool convertBuildVectorCastElt(SelectionDAG &DAG, SDValue &Lo, + SDValue &Hi) { if (Hi.getOpcode() == ISD::BITCAST && Hi.getOperand(0).getValueType() == MVT::f16 && (isa(Lo) || Lo.isUndef())) { @@ -7378,8 +7346,9 @@ return false; } -SDValue SITargetLowering::performBuildVectorCombine( - SDNode *N, DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performBuildVectorCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SDLoc SL(N); if (!isTypeLegal(MVT::v2i16)) @@ -7394,12 +7363,12 @@ // v2i16 build_vector (const|undef), (bitcast f16:$x) // -> bitcast (v2f16 build_vector const|undef, $x if (convertBuildVectorCastElt(DAG, Lo, Hi)) { - SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); } if (convertBuildVectorCastElt(DAG, Hi, Lo)) { - SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo }); + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, {Hi, Lo}); return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); } } @@ -7429,10 +7398,8 @@ return 0; } -static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, - EVT VT, - SDValue N0, SDValue N1, SDValue N2, - bool Signed) { +static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, + SDValue N0, SDValue N1, SDValue N2, bool Signed) { unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); @@ -7447,10 +7414,9 @@ SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) - && Subtarget->hasMad64_32() && - !VT.isVector() && VT.getScalarSizeInBits() > 32 && - VT.getScalarSizeInBits() <= 64) { + if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) && + Subtarget->hasMad64_32() && !VT.isVector() && + VT.getScalarSizeInBits() > 32 && VT.getScalarSizeInBits() <= 64) { if (LHS.getOpcode() != ISD::MUL) std::swap(LHS, RHS); @@ -7489,7 +7455,8 @@ Opc = RHS.getOpcode(); switch (Opc) { - default: break; + default: + break; case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { @@ -7497,15 +7464,16 @@ if (!isBoolSGPR(Cond)) break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); - SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; + SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond}; Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY; return DAG.getNode(Opc, SL, VTList, Args); } case ISD::ADDCARRY: { // add x, (addcarry y, 0, cc) => addcarry x, y, cc auto C = dyn_cast(RHS.getOperand(1)); - if (!C || C->getZExtValue() != 0) break; - SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; + if (!C || C->getZExtValue() != 0) + break; + SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)}; return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); } } @@ -7533,14 +7501,15 @@ auto C = dyn_cast(LHS.getOperand(1)); if (!C || C->getZExtValue() != 0) return SDValue(); - SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; + SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)}; return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); } return SDValue(); } -SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, + DAGCombinerInfo &DCI) const { if (N->getValueType(0) != MVT::i32) return SDValue(); @@ -7558,7 +7527,7 @@ unsigned Opc = N->getOpcode(); if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { - SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; + SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)}; return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); } return SDValue(); @@ -7628,7 +7597,7 @@ SDValue A = LHS.getOperand(0); if (A == LHS.getOperand(1)) { unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); - if (FusedOp != 0){ + if (FusedOp != 0) { const SDValue Two = DAG.getConstantFP(2.0, SL, VT); SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); @@ -7643,7 +7612,7 @@ SDValue A = RHS.getOperand(0); if (A == RHS.getOperand(1)) { unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); - if (FusedOp != 0){ + if (FusedOp != 0) { const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); } @@ -7668,13 +7637,13 @@ SDValue Op2 = N->getOperand(1); SDValue FMA = N->getOperand(2); - if (FMA.getOpcode() != ISD::FMA || - Op1.getOpcode() != ISD::FP_EXTEND || + if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND || Op2.getOpcode() != ISD::FP_EXTEND) return SDValue(); // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, - // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract + // regardless of the denorm mode setting. Therefore, + // unsafe-fp-math/fp-contract // is sufficient to allow generaing fdot2. 
const TargetOptions &Options = DAG.getTarget().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || @@ -7719,8 +7688,7 @@ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) return SDValue(); - if ((Vec1 == Vec3 && Vec2 == Vec4) || - (Vec1 == Vec4 && Vec2 == Vec3)) { + if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) { return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, DAG.getTargetConstant(0, SL, MVT::i1)); } @@ -7792,29 +7760,28 @@ } } - if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && - VT != MVT::f16)) + if (VT != MVT::f32 && VT != MVT::f64 && + (Subtarget->has16BitInsts() && VT != MVT::f16)) return SDValue(); // Match isinf/isfinite pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) // (fcmp one (fabs x), inf) -> (fp_class x, // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) - if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) { + if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && + LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast(RHS); if (!CRHS) return SDValue(); const APFloat &APF = CRHS->getValueAPF(); if (APF.isInfinity() && !APF.isNegative()) { - const unsigned IsInfMask = SIInstrFlags::P_INFINITY | - SIInstrFlags::N_INFINITY; - const unsigned IsFiniteMask = SIInstrFlags::N_ZERO | - SIInstrFlags::P_ZERO | - SIInstrFlags::N_NORMAL | - SIInstrFlags::P_NORMAL | - SIInstrFlags::N_SUBNORMAL | - SIInstrFlags::P_SUBNORMAL; + const unsigned IsInfMask = + SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + const unsigned IsFiniteMask = + SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL | + SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::P_SUBNORMAL; unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask; return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), DAG.getConstant(Mask, SL, MVT::i32)); @@ -7824,8 +7791,9 @@ return SDValue(); } -SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, - DAGCombinerInfo &DCI) const { +SDValue +SITargetLowering::performCvtF32UByteNCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; @@ -7841,8 +7809,7 @@ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x - if (const ConstantSDNode *C = - dyn_cast(Srl.getOperand(1))) { + if (const ConstantSDNode *C = dyn_cast(Srl.getOperand(1))) { Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), EVT(MVT::i32)); @@ -7890,7 +7857,6 @@ return SDValue(CSrc, 0); } - SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -7949,7 +7915,7 @@ case AMDGPUISD::ATOMIC_DEC: case AMDGPUISD::ATOMIC_LOAD_FADD: case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast(N), DCI); @@ -8022,11 +7988,16 @@ /// Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { - default: return 0; - case AMDGPU::sub0: return 0; - case AMDGPU::sub1: return 1; - case AMDGPU::sub2: return 2; - case AMDGPU::sub3: return 3; + default: + return 0; + case AMDGPU::sub0: + return 0; + case AMDGPU::sub1: + return 1; + case AMDGPU::sub2: + return 2; + case AMDGPU::sub3: + return 3; } } @@ -8040,9 +8011,10 @@ if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) return Node; // not implemented for D16 - SDNode *Users[4] = { nullptr }; + SDNode *Users[4] = {nullptr}; unsigned Lane = 0; - unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; + unsigned DmaskIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; bool HasChain = Node->getNumValues() > 1; @@ -8053,8 +8025,8 @@ } // Try to figure out the used register components - for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); - I != E; ++I) { + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E; + ++I) { // Don't look at users of the chain. if (I.getUse().getResNo() != 0) @@ -8105,14 +8077,13 @@ MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); - MVT ResultVT = BitsSet == 1 ? - SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); - SDVTList NewVTList = HasChain ? - DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); - + MVT ResultVT = + BitsSet == 1 ? SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); + SDVTList NewVTList = + HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); - MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node), - NewVTList, Ops); + MachineSDNode *NewNode = + DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops); if (HasChain) { // Update chain. @@ -8122,9 +8093,9 @@ if (BitsSet == 1) { assert(Node->hasNUsesOfValue(1, 0)); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY, - SDLoc(Node), Users[Lane]->getValueType(0), - SDValue(NewNode, 0)); + SDNode *Copy = + DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node), + Users[Lane]->getValueType(0), SDValue(NewNode, 0)); DAG.ReplaceAllUsesWith(Users[Lane], Copy); return nullptr; } @@ -8139,10 +8110,17 @@ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); switch (Idx) { - default: break; - case AMDGPU::sub0: Idx = AMDGPU::sub1; break; - case AMDGPU::sub1: Idx = AMDGPU::sub2; break; - case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + default: + break; + case AMDGPU::sub0: + Idx = AMDGPU::sub1; + break; + case AMDGPU::sub1: + Idx = AMDGPU::sub2; + break; + case AMDGPU::sub2: + Idx = AMDGPU::sub3; + break; } } @@ -8160,8 +8138,9 @@ /// Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. 
-SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { +SDNode * +SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { if (Node->getOpcode() == ISD::CopyToReg) { RegisterSDNode *DestReg = cast(Node->getOperand(1)); SDValue SrcVal = Node->getOperand(2); @@ -8173,15 +8152,14 @@ SDLoc SL(Node); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue VReg = DAG.getRegister( - MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); + MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1); SDNode *Glued = Node->getGluedNode(); - SDValue ToVReg - = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, - SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); - SDValue ToResultReg - = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), - VReg, ToVReg.getValue(1)); + SDValue ToVReg = DAG.getCopyToReg( + Node->getOperand(0), SL, VReg, SrcVal, + SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); + SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0), + VReg, ToVReg.getValue(1)); DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode()); DAG.RemoveDeadNode(Node); return ToResultReg.getNode(); @@ -8197,8 +8175,9 @@ SDLoc DL(Node); Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, - Node->getOperand(i).getValueType(), - Node->getOperand(i)), 0)); + Node->getOperand(i).getValueType(), + Node->getOperand(i)), + 0)); } return DAG.UpdateNodeOperands(Node, Ops); @@ -8216,8 +8195,7 @@ return adjustWritemask(Node, DAG); } - if (Opcode == AMDGPU::INSERT_SUBREG || - Opcode == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } @@ -8243,8 +8221,8 @@ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); - SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), - UndefReg, Src0, SDValue()); + SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg, + Src0, SDValue()); // src0 must be the same register as src1 or src2, even if the value is // undefined, so make sure we don't violate this constraint. @@ -8264,7 +8242,7 @@ } else break; - SmallVector Ops = { Src0, Src1, Src2 }; + SmallVector Ops = {Src0, Src1, Src2}; for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I) Ops.push_back(Node->getOperand(I)); @@ -8307,8 +8285,7 @@ // This means these instructions always have a use, so we need to add a // special case to check if the atomic has only one extract_subreg use, // which itself has no uses. - if ((Node->hasNUsesOfValue(1, 0) && - Node->use_begin()->isMachineOpcode() && + if ((Node->hasNUsesOfValue(1, 0) && Node->use_begin()->isMachineOpcode() && Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && !Node->use_begin()->hasAnyUseOfValue(0))) { unsigned Def = MI.getOperand(0).getReg(); @@ -8343,24 +8320,20 @@ // full 128-bit register. If we are building multiple resource descriptors, // this will allow CSEing of the 2-component register. 
const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); + SDValue SubRegHi = SDValue( + DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0); // Combine the constants and the pointer. const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)}; return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } @@ -8375,26 +8348,26 @@ SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); if (RsrcDword1) { - PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, - DAG.getConstant(RsrcDword1, DL, MVT::i32)), - 0); + PtrHi = + SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); } - SDValue DataLo = buildSMovImm32(DAG, DL, - RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataLo = + buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - PtrLo, - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - PtrHi, - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - DataLo, - DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), - DataHi, - DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) - }; + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)}; return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } @@ -8489,7 +8462,8 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { - default: break; + default: + break; case 's': case 'v': return C_RegisterClass; @@ -8516,9 +8490,8 @@ // during lowering. Calls are only detected after the function is // lowered. We're about to reserve registers, so don't bother using it if we // aren't really going to use it. 
- bool NeedSP = !Info->isEntryFunction() || - MFI.hasVarSizedObjects() || - MFI.hasCalls(); + bool NeedSP = + !Info->isEntryFunction() || MFI.hasVarSizedObjects() || MFI.hasCalls(); if (NeedSP) { unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); @@ -8545,8 +8518,8 @@ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, - DAG, Depth); + TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, DAG, + Depth); if (getSubtarget()->enableHugePrivateBuffer()) return; @@ -8558,68 +8531,61 @@ Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } -bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, - FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const -{ +bool SITargetLowering::isSDNodeSourceOfDivergence( + const SDNode *N, FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const { switch (N->getOpcode()) { - case ISD::Register: - case ISD::CopyFromReg: - { - const RegisterSDNode *R = nullptr; - if (N->getOpcode() == ISD::Register) { - R = dyn_cast(N); - } - else { - R = dyn_cast(N->getOperand(1)); - } - if (R) - { - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) - return TRI.isVGPR(MRI, Reg); - - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (TRI.isVGPR(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - } - return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + case ISD::Register: + case ISD::CopyFromReg: { + const RegisterSDNode *R = nullptr; + if (N->getOpcode() == ISD::Register) { + R = dyn_cast(N); + } else { + R = dyn_cast(N->getOperand(1)); + } + if (R) { + const MachineFunction *MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return TRI.isVGPR(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (TRI.isVGPR(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; } + return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); } - break; - case ISD::LOAD: { - const LoadSDNode *L = dyn_cast(N); - if (L->getMemOperand()->getAddrSpace() == - Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) - return true; - } break; - case ISD::CALLSEQ_END: + } break; + case ISD::LOAD: { + const LoadSDNode *L = dyn_cast(N); + if (L->getMemOperand()->getAddrSpace() == + Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } break; + case ISD::CALLSEQ_END: return true; break; - case ISD::INTRINSIC_WO_CHAIN: - { - - } - return AMDGPU::isIntrinsicSourceOfDivergence( - cast(N->getOperand(0))->getZExtValue()); - case ISD::INTRINSIC_W_CHAIN: - return AMDGPU::isIntrinsicSourceOfDivergence( - cast(N->getOperand(1))->getZExtValue()); - // In some cases intrinsics that 
are a source of divergence have been - // lowered to AMDGPUISD so we also need to check those too. - case AMDGPUISD::INTERP_MOV: - case AMDGPUISD::INTERP_P1: - case AMDGPUISD::INTERP_P2: - return true; + case ISD::INTRINSIC_WO_CHAIN: { + } + return AMDGPU::isIntrinsicSourceOfDivergence( + cast(N->getOperand(0))->getZExtValue()); + case ISD::INTRINSIC_W_CHAIN: + return AMDGPU::isIntrinsicSourceOfDivergence( + cast(N->getOperand(1))->getZExtValue()); + // In some cases intrinsics that are a source of divergence have been + // lowered to AMDGPUISD so we also need to check those too. + case AMDGPUISD::INTERP_MOV: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + return true; } return false; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -746,7 +746,7 @@ def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; -def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -0,0 +1,136 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}gather4_2d: +; GCN: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_cube: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da{{$}} +define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_2darray: +; GCN: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da{{$}} +define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_2d: +; GCN: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_cl_2d: +; GCN: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps 
<4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_cl_2d: +; GCN: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_2d: +; GCN: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_b_2d: +; GCN: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_cl_2d: +; GCN: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_b_cl_2d: +; GCN: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_l_2d: +; GCN: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_l_2d: +; GCN: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16{{$}} +define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; 
GCN-LABEL: {{^}}gather4_lz_2d:
+; GCN: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16{{$}}
+define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}gather4_c_lz_2d:
+; GCN: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16{{$}}
+define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -0,0 +1,435 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}sample_1d:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_2d:
+; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}}
+define
amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_3d: +; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}} +define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cube: +; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 da{{$}} +define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_1darray: +; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 da{{$}} +define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_2darray: +; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 da{{$}} +define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_1d: +; GCN: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}} +define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_2d: +; GCN: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}} +define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_1d: +; GCN: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}} +define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_2d: +; GCN: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}} +define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half 
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cl_1d:
+; GCN: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cl_2d:
+; GCN: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_b_1d:
+; GCN: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_b_2d:
+; GCN: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_b_1d:
+; GCN: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_b_2d:
+; GCN: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_b_cl_1d:
+; GCN: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_b_cl_2d:
+; GCN: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_b_cl_1d:
+; GCN: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_b_cl_2d:
+; GCN: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_d_1d:
+; GCN: image_sample_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_d_2d:
+; GCN: image_sample_d v[0:3], v[1:4], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_d_3d:
+; GCN: image_sample_d v[0:3], v[2:9], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_1d:
+; GCN: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_2d:
+; GCN: image_sample_c_d v[0:3], v[1:4], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_d_cl_1d:
+; GCN: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_d_cl_2d:
+; GCN: image_sample_d_cl v[0:3], v[2:5], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_cl_1d:
+; GCN: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_cl_2d:
+; GCN: image_sample_c_d_cl v[0:3], v[2:9], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_cd_1d:
+; GCN: image_sample_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_cd_2d:
+; GCN: image_sample_cd v[0:3], v[1:4], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cd_1d:
+; GCN: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cd_2d:
+; GCN: image_sample_c_cd v[0:3], v[1:4], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_cd_cl_1d:
+; GCN: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_cd_cl_2d:
+; GCN: image_sample_cd_cl v[0:3], v[2:5], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cd_cl_1d:
+; GCN: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_cd_cl_2d:
+; GCN: image_sample_c_cd_cl v[0:3], v[2:9], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_l_1d:
+; GCN: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_l_2d:
+; GCN: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_l_1d:
+; GCN: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_l_2d:
+; GCN: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_lz_1d:
+; GCN: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_lz_2d:
+; GCN: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_lz_1d:
+; GCN: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_lz_2d:
+; GCN: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16{{$}}
+define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1:
+; GCN: image_sample_c_d_o v0, v[2:9], s[0:7], s[8:11] dmask:0x4 a16 da{{$}}
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
+main_body:
+  %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2:
+; GCN: image_sample_c_d_o v[0:1], v[2:9], s[0:7], s[8:11] dmask:0x6 a16 da{{$}}
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32, float, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32, float, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32, float, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32, float, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32, i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32, i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
Index: test/MC/AMDGPU/mimg.s
===================================================================
--- test/MC/AMDGPU/mimg.s
+++ test/MC/AMDGPU/mimg.s
@@ -267,6 +267,30 @@
 // GFX8_1: image_sample v[193:194], v[237:240], s[28:35], s[4:7] dmask:0x7 d16 ; encoding: [0x00,0x07,0x80,0xf0,0xed,0xc1,0x27,0x80]
 // GFX9: image_sample v[193:194], v[237:240], s[28:35], s[4:7] dmask:0x7 d16 ; encoding: [0x00,0x07,0x80,0xf0,0xed,0xc1,0x27,0x80]
+//===----------------------------------------------------------------------===//
+// Image Sample: a16
+//===----------------------------------------------------------------------===//
+
+image_sample v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16
+// GFX9: image_sample v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16 ; encoding: [0x00,0x8f,0x80,0xf0,0xed,0xc1,0x27,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU
+
+image_sample_d v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16
+// GFX9: image_sample_d v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16 ; encoding: [0x00,0x8f,0x88,0xf0,0xed,0xc1,0x27,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU
+
+image_sample_c_d v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16
+// GFX9: image_sample_c_d v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16 ; encoding: [0x00,0x8f,0xa8,0xf0,0xed,0xc1,0x27,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU
+
+image_sample_c_d_cl v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16
+// GFX9: image_sample_c_d_cl v[193:196], v[237:240], s[28:35], s[4:7] dmask:0xf a16 ; encoding: [0x00,0x8f,0xac,0xf0,0xed,0xc1,0x27,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU
+
+//===----------------------------------------------------------------------===//
 // Image Atomics
 //===----------------------------------------------------------------------===//
@@ -372,3 +396,13 @@
 // NOGFX8_0: error: image data size does not match dmask and tfe
 // NOGFX8_1: error: image data size does not match dmask and tfe
 // NOGFX9: error: image data size does not match dmask and tfe
+
+image_gather4 v[5:8], v1, s[8:15], s[12:15] dmask:0x1 a16
+// GFX9: image_gather4 v[5:8], v1, s[8:15], s[12:15] dmask:0x1 a16 ; encoding: [0x00,0x81,0x00,0xf1,0x01,0x05,0x62,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU
+
+image_gather4_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 a16
+// GFX9: image_gather4_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0x1 a16 ; encoding: [0x00,0x81,0x18,0xf1,0x01,0x05,0x62,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI: error: a16 modifier is not supported on this GPU