Index: lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -38,7 +38,8 @@
                     unsigned VReg) const override;
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 };
 } // End of namespace llvm;
 #endif
Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,6 +13,8 @@
 // Inversion of CCIfInReg
 class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
 
 // Calling convention for SI
 def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@
   ]>>>
 ]>;
 
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
   CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@
   CCCustom<"allocateKernArg">
 ]>;
 
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+  (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf function return values
+def RetCC_AMDGPU_Func : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
 def CC_AMDGPU : CallingConv<[
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -115,9 +115,6 @@
                              SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                                      const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeReturn(CCState &State,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -162,6 +159,8 @@
   bool isCheapToSpeculateCtlz() const override;
 
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@
   }
 }
 
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements, would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_64RegClass, 31);
+  }
+  case MVT::v4i32:
+  case MVT::v4f32:
+  case MVT::v2i64:
+  case MVT::v2f64: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_128RegClass, 29);
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_256RegClass, 25);
+
+  }
+  case MVT::v16i32:
+  case MVT::v16f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_512RegClass, 17);
+
+  }
+  default:
+    return false;
+  }
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 // Find a larger type to do a load / store of a vector with.
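
Review note: every case of allocateVGPRTuple funnels into an allocateCCRegs helper that is not part of this hunk (presumably defined earlier in AMDGPUISelLowering.cpp). For context, a minimal sketch of such a helper, assuming it only tries to hand out one register from a prefix of the given class and records the assignment; the in-tree definition may differ in detail:

static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State,
                           const TargetRegisterClass *RC,
                           unsigned NumRegs) {
  // Only consider the first NumRegs registers of the class so that a wide
  // tuple never extends past the argument register budget.
  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  unsigned Reg = State.AllocateReg(RegList);
  if (Reg == AMDGPU::NoRegister)
    return false; // Fall through to the CCAssignToStack rules.

  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
  return true;
}

The per-type limits used above (31, 29, 25, 17) line up with the "Allocate up to VGPR31" comment: the last 64-bit, 128-bit, 256-bit, and 512-bit tuple that can still be allocated ends exactly at VGPR31.
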
@@ -767,8 +806,42 @@ //===---------------------------------------------------------------------===// CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, - bool IsVarArg) const { - return CC_AMDGPU; + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return CC_AMDGPU; + case CallingConv::C: + case CallingConv::Fast: + return CC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } +} + +CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return RetCC_SI_Shader; + case CallingConv::C: + case CallingConv::Fast: + return RetCC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } } /// The SelectionDAGBuilder will automatically promote function arguments @@ -868,18 +941,15 @@ } } -void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, - const SmallVectorImpl &Outs) const { - - State.AnalyzeReturn(Outs, RetCC_SI); -} - -SDValue -AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SDLoc &DL, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { + // FIXME: Fails for r600 tests + //assert(!isVarArg && Outs.empty() && OutVals.empty() && + // "wave terminate should not have return values"); return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); } @@ -890,20 +960,12 @@ /// Selects the correct CCAssignFn for a given CallingConvention value. 
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) { - switch (CC) { - case CallingConv::C: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - return CC_AMDGPU; - default: - report_fatal_error("Unsupported calling convention."); - } + return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); +} + +CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -380,6 +380,6 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -126,9 +126,15 @@ } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + unsigned Opcode = MI->getOpcode(); - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We + // need to select it to the subtarget specific version, and there's no way to + // do that with a single pseudo source operation. + if (Opcode == AMDGPU::S_SETPC_B64_return) + Opcode = AMDGPU::S_SETPC_B64; + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " Index: lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -12,21 +12,6 @@ using namespace llvm; -static bool isEntryFunctionCC(CallingConv::ID CC) { - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - return true; - default: - return false; - } -} - AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), @@ -34,7 +19,7 @@ MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())), + IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. 
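
Review note: both selectors now share one per-calling-convention dispatch; the SelectionDAG hooks above simply forward to the static helpers on AMDGPUCallLowering. A sketch of how a caller would typically drive this, using the usual CCState workflow (the F, MF, Ins, and ArgLocs names here are illustrative, not taken from the patch):

// Pick the argument-assignment function for this function's calling
// convention, then let CCState distribute the formal arguments.
CCAssignFn *AssignFn =
    AMDGPUCallLowering::CCAssignFnForCall(F.getCallingConv(), F.isVarArg());

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
CCInfo.AnalyzeFormalArguments(Ins, AssignFn); // Ins: SmallVector<ISD::InputArg, 16>

With the tables added in AMDGPUCallingConv.td, amdgpu_kernel/spir_kernel resolve to CC_AMDGPU_Kernel, the shader conventions keep CC_AMDGPU (and RetCC_SI_Shader for returns), and C/Fast get the new CC_AMDGPU_Func/RetCC_AMDGPU_Func, which is what lets ordinary callable functions pass arguments in VGPR0-VGPR31 and on the stack.
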
Index: lib/Target/AMDGPU/AMDGPURegisterInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -30,9 +30,6 @@ /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; }; } // End namespace llvm Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -14,6 +14,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIRegisterInfo.h" using namespace llvm; @@ -24,18 +25,6 @@ // they are not supported at this time. //===----------------------------------------------------------------------===// -// Dummy to not crash RegisterClassInfo. -static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs( - const MachineFunction *) const { - return &CalleeSavedReg; -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, @@ -50,3 +39,35 @@ #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" + + +// Forced to be here by one .inc +const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( + const MachineFunction *MF) const { + CallingConv::ID CC = MF->getFunction()->getCallingConv(); + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_SaveList; + default: { + // Dummy to not crash RegisterClassInfo. + static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; + return &NoCalleeSavedReg; + } + } +} + +const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_RegMask; + default: + return nullptr; + } +} + +unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} Index: lib/Target/AMDGPU/R600RegisterInfo.h =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.h +++ lib/Target/AMDGPU/R600RegisterInfo.h @@ -27,6 +27,8 @@ R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; Index: lib/Target/AMDGPU/R600RegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.cpp +++ lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -56,6 +56,18 @@ return Reserved; } +// Dummy to not crash RegisterClassInfo. 
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { + return &CalleeSavedReg; +} + +unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } Index: lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.h +++ lib/Target/AMDGPU/SIFrameLowering.h @@ -26,6 +26,8 @@ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; + void emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -189,8 +189,6 @@ // ---- // 13 (+1) unsigned ReservedRegCount = 13; - if (SPReg != AMDGPU::NoRegister) - ++ReservedRegCount; if (AllSGPRs.size() < ReservedRegCount) return std::make_pair(ScratchWaveOffsetReg, SPReg); @@ -208,13 +206,6 @@ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); MFI->setScratchWaveOffsetReg(Reg); ScratchWaveOffsetReg = Reg; - } else { - if (SPReg == AMDGPU::NoRegister) - break; - - MRI.replaceRegWith(SPReg, Reg); - MFI->setStackPtrOffsetReg(Reg); - SPReg = Reg; break; } } @@ -223,8 +214,8 @@ return std::make_pair(ScratchWaveOffsetReg, SPReg); } -void SIFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { +void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. 
const SISubtarget &ST = MF.getSubtarget(); @@ -424,6 +415,13 @@ } } +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + if (MFI->isEntryFunction()) + emitEntryFunctionPrologue(MF, MBB); +} + void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,10 @@ uint64_t Offset, bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -178,7 +182,12 @@ const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + bool CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -918,6 +918,55 @@ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } +SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (Arg.Flags.isByVal()) { + unsigned Size = Arg.Flags.getByValSize(); + int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); + return DAG.getFrameIndex(FrameIdx, MVT::i32); + } + + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getValVT().getStoreSize(); + + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); + + // Create load nodes to retrieve arguments from the stack. 
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + SDValue ArgValue; + + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); + + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } + + ArgValue = DAG.getExtLoad( + ExtType, SL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT); + return ArgValue; +} + static void processShaderInputArgs(SmallVectorImpl &Splits, CallingConv::ID CallConv, ArrayRef Ins, @@ -1098,10 +1147,12 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { + SIMachineFunctionInfo &Info, + bool NeedSP) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool HasStackObjects = MFI.hasStackObjects(); // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. @@ -1159,6 +1210,15 @@ Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } + + if (NeedSP){ + unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); + Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); + assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), + Info.getStackPtrOffsetReg())); + } } SDValue SITargetLowering::LowerFormalArguments( @@ -1227,8 +1287,10 @@ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + } else if (IsKernel) { + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())); + Splits.append(Ins.begin(), Ins.end()); } if (IsEntryFunc) { @@ -1282,11 +1344,14 @@ InVals.push_back(Arg); continue; + } else if (!IsEntryFunc && VA.isMemLoc()) { + SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); + InVals.push_back(Val); + if (!Arg.Flags.isByVal()) + Chains.push_back(Val.getValue(1)); + continue; } - if (VA.isMemLoc()) - report_fatal_error("memloc not supported with calling convention"); - assert(VA.isRegLoc() && "Parameter must be in a register!"); unsigned Reg = VA.getLocReg(); @@ -1295,7 +1360,7 @@ Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - if (Arg.VT.isVector()) { + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -1321,16 +1386,49 @@ InVals.push_back(Val); } + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + + // TODO: Could maybe omit SP if only tail calls? + bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); + // Start adding system SGPRs. 
-  if (IsEntryFunc)
+  if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
-
-  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
+  } else {
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+    CCInfo.AllocateReg(Info->getFrameOffsetReg());
+
+    if (NeedSP) {
+      unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
+      CCInfo.AllocateReg(StackPtrReg);
+      Info->setStackPtrOffsetReg(StackPtrReg);
+    }
+  }
 
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing on stack.
+bool SITargetLowering::CanLowerReturn(
+  CallingConv::ID CallConv,
+  MachineFunction &MF, bool IsVarArg,
+  const SmallVectorImpl<ISD::OutputArg> &Outs,
+  LLVMContext &Context) const {
+  // Replacing returns with sret/stack usage doesn't make sense for shaders.
+  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+  // for shaders. Vector types should be explicitly handled by CC.
+  if (AMDGPU::isEntryFunctionCC(CallConv))
+    return true;
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
 SDValue
 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
@@ -1340,11 +1438,15 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  if (!AMDGPU::isShader(CallConv))
+  if (AMDGPU::isKernel(CallConv)) {
     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                              OutVals, DL, DAG);
+  }
+
+  bool IsShader = AMDGPU::isShader(CallConv);
 
   Info->setIfReturnsVoid(Outs.size() == 0);
+  bool IsWaveEnd = Info->returnsVoid() && IsShader;
 
   SmallVector<ISD::OutputArg, 48> Splits;
   SmallVector<SDValue, 48> SplitVals;
@@ -1353,7 +1455,7 @@
   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
     const ISD::OutputArg &Out = Outs[i];
 
-    if (Out.VT.isVector()) {
+    if (IsShader && Out.VT.isVector()) {
       MVT VT = Out.VT.getVectorElementType();
       ISD::OutputArg NewOut = Out;
       NewOut.Flags.setSplit();
@@ -1384,29 +1486,58 @@
                  *DAG.getContext());
 
   // Analyze outgoing return values.
-  AnalyzeReturn(CCInfo, Splits);
+  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 48> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
 
+  // Add return address for callable functions.
+  if (!Info->isEntryFunction()) {
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    // FIXME: Should be able to use a vreg here, but need a way to prevent it
+    // from being allocated to a CSR.
+
+    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                                MVT::i64);
+
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    Flag = Chain.getValue(1);
+
+    RetOps.push_back(PhysReturnAddrReg);
+  }
+
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
        ++i, ++realRVLocIdx) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
+    // TODO: Partially return in registers if return values don't fit.
 
     SDValue Arg = SplitVals[realRVLocIdx];
 
     // Copied from other backends.
     switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::BCvt:
       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
       break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
     }
 
     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1414,12 +1545,16 @@
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
+  // FIXME: Does sret work properly?
+
   // Update chain and glue.
   RetOps[0] = Chain;
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
+  unsigned Opc = AMDGPUISD::ENDPGM;
+  if (!IsWaveEnd)
+    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,9 +388,8 @@
   void setScratchWaveOffsetReg(unsigned Reg) {
     assert(Reg != AMDGPU::NoRegister && "Should never be unset");
     ScratchWaveOffsetReg = Reg;
-
-    // FIXME: Only for entry functions.
-    FrameOffsetReg = ScratchWaveOffsetReg;
+    if (isEntryFunction())
+      FrameOffsetReg = ScratchWaveOffsetReg;
   }
 
   unsigned getQueuePtrUserSGPR() const {
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -80,17 +80,22 @@
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
   WavesPerEU = ST.getWavesPerEU(*F);
 
-  // Non-entry functions have no special inputs for now.
-  // TODO: Return early for non-entry CCs.
+  if (!isEntryFunction()) {
+    // Non-entry functions have no special inputs for now, other than
+    // registers required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + ScratchWaveOffsetReg = AMDGPU::SGPR4; + FrameOffsetReg = AMDGPU::SGPR5; + return; + } CallingConv::ID CC = F->getCallingConv(); - if (CC == CallingConv::AMDGPU_PS) - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); - - if (AMDGPU::isKernel(CC)) { + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); } if (ST.debuggerEmitPrologue()) { @@ -120,7 +125,7 @@ const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects(); + bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -17,6 +17,7 @@ #include "AMDGPURegisterInfo.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -57,8 +58,16 @@ unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; + unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + + unsigned getFrameRegister(const MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -228,6 +237,11 @@ const int *getRegUnitPressureSets(unsigned RegUnit) const override; + unsigned getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. + return AMDGPU::SGPR30_SGPR31; + } + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -117,11 +117,7 @@ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - - const SISubtarget &ST = MF.getSubtarget(); - unsigned RegCount = ST.getMaxNumSGPRs(MF); +static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned Reg; // Try to place it in a hole after PrivateSegmentBufferReg. @@ -134,9 +130,22 @@ // wave offset before it. 
Reg = RegCount - 5; } + + return Reg; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const SISubtarget &ST = MF.getSubtarget(); + unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } +unsigned SIRegisterInfo::reservedStackPtrOffsetReg( + const MachineFunction &MF) const { + return AMDGPU::SGPR32; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -198,15 +207,33 @@ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, StackPtrReg); + assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); + } + + unsigned FrameReg = MFI->getFrameOffsetReg(); + if (FrameReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, FrameReg); + assert(!isSubRegister(ScratchRSrcReg, FrameReg)); + } + return Reserved; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo().hasStackObjects(); + const SIMachineFunctionInfo *Info = Fn.getInfo(); + if (Info->isEntryFunction()) { + const MachineFrameInfo &MFI = Fn.getFrameInfo(); + return MFI.hasStackObjects() || MFI.hasCalls(); + } + + // May need scavenger for dealing with callee saved registers. + return true; } -bool -SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { +bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects(); } Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -186,11 +186,23 @@ def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; -let isTerminator = 1, isBarrier = 1, - isBranch = 1, isIndirectBranch = 1 in { +let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { + +let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; +} // End isBranch = 1, isIndirectBranch = 1 + +let isReturn = 1 in { +// Define variant marked as return rather than branch. 
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +} +} // End isTerminator = 1, isBarrier = 1 + +let isCall = 1 in { +def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64" +>; } -def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; + def S_RFE_B64 : SOP1_1 <"s_rfe_b64">; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -262,7 +262,6 @@ LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { switch (CC) { - case CallingConv::C: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: return true; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -518,7 +518,18 @@ } bool isEntryFunctionCC(CallingConv::ID CC) { - return true; + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return true; + default: + return false; + } } bool isSI(const MCSubtargetInfo &STI) { Index: test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -6,7 +6,8 @@ ; Tests for add. ; CHECK: name: addi32 ; CHECK: {{%[0-9]+}}(s32) = G_ADD -define i32 @addi32(i32 %arg1, i32 %arg2) { +define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) { %res = add i32 %arg1, %arg2 - ret i32 %res + store i32 %res, i32 addrspace(1)* undef + ret void } Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Test that non-entry function frame indices are expanded properly to +; give an index relative to the scratch wave offset register + +; Materialize into a mov. Make sure there isn't an unnecessary copy. +; GCN-LABEL: {{^}}func_mov_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_mov_fi_i32() #0 { + %alloca = alloca i32 + store volatile i32* %alloca, i32* addrspace(3)* undef + ret void +} + +; Materialize into an add of a constant offset from the FI. +; FIXME: Should be able to merge adds + +; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: s_lshr_b32 s6, s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_add_constant_to_fi_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1 + store volatile i32* %gep0, i32* addrspace(3)* undef + ret void +} + +; A user the materialized frame index can't be meaningfully folded +; into. 
+ +; GCN-LABEL: {{^}}func_other_fi_user_i32: +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NEXT: v_mul_lo_i32 v0, v0, 9 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_other_fi_user_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %ptrtoint = ptrtoint [2 x i32]* %alloca to i32 + %mul = mul i32 %ptrtoint, 9 + store volatile i32 %mul, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: +; GCN: v_mov_b32_e32 v1, 15{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}} +define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 { + store volatile i32 15, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}} +define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 { + %val = load volatile i32, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 +; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load0 = load i8, i8* %gep0 + %load1 = load i32, i32* %gep1 + store volatile i8 %load0, i8 addrspace(3)* undef + store volatile i32 %load1, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GCN: s_sub_u32 s8, s5, s4 +; GCN: v_lshr_b32_e64 v1, s8, 6 +; GCN: s_and_saveexec_b64 + +; GCN: v_add_i32_e32 v0, vcc, 4, v1 +; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4 +; GCN: ds_write_b32 +define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 { + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %bb, label %ret + +bb: + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load volatile i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/function-args.ll @@ -0,0 +1,734 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}void_func_i1: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i1(i1 %arg0) #0 { + store i1 %arg0, i1 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_zeroext: +; GCN: s_waitcnt +; GCN-NEXT: v_or_b32_e32 v0, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 { + %ext = zext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_signext: +; GCN: s_waitcnt +; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i1_signext(i1 signext %arg0) #0 { + %ext = sext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8: +; GCN-NOT: v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i8(i8 %arg0) #0 { + store i8 %arg0, i8 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_zeroext: +; GCN-NOT: and_b32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { + %ext = zext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_signext: +; GCN-NOT: v_bfe_i32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_signext(i8 signext %arg0) #0 { + %ext = sext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16: +; GCN: buffer_store_short v0, off +define void @void_func_i16(i16 %arg0) #0 { + store i16 %arg0, i16 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_zeroext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { + %ext = zext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_signext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_signext(i16 signext %arg0) #0 { + %ext = sext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i32(i32 %arg0) #0 { + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_i64(i64 %arg0) #0 { + store i64 %arg0, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f16: +; VI-NOT: v0 +; CI: v_cvt_f16_f32_e32 v0, v0 +; GCN: buffer_store_short v0, off +define void @void_func_f16(half %arg0) #0 { + store half %arg0, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f32 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_f32(float %arg0) #0 { + store float %arg0, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_f64(double %arg0) #0 { + store double %arg0, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void 
@void_func_v2i32(<2 x i32> %arg0) #0 { + store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3i32(<3 x i32> %arg0) #0 { + store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4i32(<4 x i32> %arg0) #0 { + store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dword v4, off +define void @void_func_v5i32(<5 x i32> %arg0) #0 { + store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8i32(<8 x i32> %arg0) #0 { + store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16i32(<16 x i32> %arg0) #0 { + store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v32i32(<32 x i32> %arg0) #0 { + store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + ret void +} + +; 1 over register limit +; GCN-LABEL: {{^}}void_func_v33i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5 +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN: buffer_store_dword [[STACKLOAD]], off +define void @void_func_v33i32(<33 x i32> %arg0) #0 { + store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2i64(<2 x i64> %arg0) #0 { + store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3i64(<3 x i64> %arg0) #0 { + store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4i64(<4 x i64> %arg0) #0 { + store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx2 v[8:9], off +define void @void_func_v5i64(<5 x i64> %arg0) #0 { + store <5 x i64> %arg0, <5 x 
i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8i64(<8 x i64> %arg0) #0 { + store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16i64(<16 x i64> %arg0) #0 { + store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i16: +; GFX9-NOT: v0 +; GFX9: buffer_store_dword v0, off +define void @void_func_v2i16(<2 x i16> %arg0) #0 { + store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i16: +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3i16(<3 x i16> %arg0) #0 { + store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4i16(<4 x i16> %arg0) #0 { + store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i16: +; GCN-DAG: buffer_store_short v4, off, +; GCN-DAG: buffer_store_dwordx2 v[1:2], off +define void @void_func_v5i16(<5 x i16> %arg0) #0 { + store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8i16(<8 x i16> %arg0) #0 { + store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16i16(<16 x i16> %arg0) #0 { + store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_v2f32(<2 x float> %arg0) #0 { + store <2 x float> %arg0, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3f32(<3 x float> %arg0) #0 { + store <3 x float> %arg0, <3 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4f32(<4 x float> %arg0) #0 { + store <4 x float> %arg0, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8f32(<8 x float> %arg0) #0 { + store <8 x float> %arg0, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16f32(<16 x float> %arg0) #0 { + store 
<16 x float> %arg0, <16 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2f64(<2 x double> %arg0) #0 { + store <2 x double> %arg0, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3f64(<3 x double> %arg0) #0 { + store <3 x double> %arg0, <3 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4f64(<4 x double> %arg0) #0 { + store <4 x double> %arg0, <4 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8f64(<8 x double> %arg0) #0 { + store <8 x double> %arg0, <8 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16f64(<16 x double> %arg0) #0 { + store <16 x double> %arg0, <16 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f16: +; GFX9-NOT: v0 +; GFX9: buffer_store_dword v0, off +define void @void_func_v2f16(<2 x half> %arg0) #0 { + store <2 x half> %arg0, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f16: +; GFX9-NOT: v0 +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3f16(<3 x half> %arg0) #0 { + store <3 x half> %arg0, <3 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-NOT: v[0:1] +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4f16(<4 x half> %arg0) #0 { + store <4 x half> %arg0, <4 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8f16(<8 x half> %arg0) #0 { + store <8 x half> %arg0, <8 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16f16(<16 x half> %arg0) #0 { + store <16 x half> %arg0, <16 x half> addrspace(1)* undef + ret void +} + +; Make sure there is no alignment requirement for passed vgprs. 
+; GCN-LABEL: {{^}}void_func_i32_i64_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +; GCN: buffer_store_dwordx2 v[1:2] +; GCN: buffer_store_dword v3 +define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 { + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i64 %arg1, i64 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_struct_i32({ i32 } %arg0) #0 { + store { i32 } %arg0, { i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i8_i32: +; GCN-DAG: buffer_store_byte v0, off +; GCN-DAG: buffer_store_dword v1, off +define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { + store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword v[[ELT1]] +; GCN-DAG: buffer_store_byte v[[ELT0]] +define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 { + %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0 + store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: ds_write_b32 v0, v0 +; GCN: s_setpc_b64 +define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 { + %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0 + %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1 + store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off +define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 { + %arg0.load = load i32, i32* %arg0 + %arg1.load = load i64, i64* %arg1 + store i32 %arg0.load, i32 addrspace(1)* undef + store i64 %arg1.load, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_i32_i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8 
+ +; GCN: buffer_store_dword v[[LOAD_ARG1]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i64 %arg2, i64 addrspace(1)* undef + ret void +} + +; FIXME: Different ext load types on CI vs. VI +; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] +; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] + +; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off +; GCN: buffer_store_byte [[LOAD_ARG2]], off +; GCN: buffer_store_short [[LOAD_ARG3]], off +; VI: buffer_store_short [[LOAD_ARG4]], off + +; CI: buffer_store_short [[CVT_ARG4]], off +define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i16 %arg3, i16 addrspace(1)* undef + store volatile half %arg4, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef + store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: +; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GFX9: buffer_store_dword [[LOAD_ARG1]], off +; GFX9: buffer_store_short [[LOAD_ARG2]], off +define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef + store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword 
v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef + store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef + store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], 
off, s[0:3], s5 offset:60{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef + store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}} +define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> 
%arg1, <16 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef + store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v16i8: +define void @void_func_v16i8(<16 x i8> %arg0) #0 { + store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v32i32_v16i8: +define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/function-returns.ll @@ -0,0 +1,514 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}i1_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define i1 @i1_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; FIXME: Missing and? +; GCN-LABEL: {{^}}i1_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i1 @i1_zeroext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i1_signext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}} +; GCN-NEXT: s_setpc_b64 +define signext i1 @i1_signext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i8_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i8 @i8_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i8 @i8_zeroext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_signext_func_void: +; GCN: buffer_load_sbyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i8 @i8_signext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i16_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i16 @i16_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_zeroext_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i16 @i16_zeroext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_signext_func_void: +; GCN: buffer_load_sshort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i16 @i16_signext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef 
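+ ; Note: buffer_load_sshort already sign-extends into the full 32-bit VGPR, so
+ ; unlike the i1 signext case above, no extra v_bfe_i32 is expected before the
+ ; return here.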
+ ret i16 %val +} + +; GCN-LABEL: {{^}}i32_func_void: +; GCN: buffer_load_dword v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i32 @i32_func_void() #0 { + %val = load i32, i32 addrspace(1)* undef + ret i32 %val +} + +; GCN-LABEL: {{^}}i64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i64 @i64_func_void() #0 { + %val = load i64, i64 addrspace(1)* undef + ret i64 %val +} + +; GCN-LABEL: {{^}}f32_func_void: +; GCN: buffer_load_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define float @f32_func_void() #0 { + %val = load float, float addrspace(1)* undef + ret float %val +} + +; GCN-LABEL: {{^}}f64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define double @f64_func_void() #0 { + %val = load double, double addrspace(1)* undef + ret double %val +} + +; GCN-LABEL: {{^}}v2i32_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i32> @v2i32_func_void() #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* undef + ret <2 x i32> %val +} + +; GCN-LABEL: {{^}}v3i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i32> @v3i32_func_void() #0 { + %val = load <3 x i32>, <3 x i32> addrspace(1)* undef + ret <3 x i32> %val +} + +; GCN-LABEL: {{^}}v4i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i32> @v4i32_func_void() #0 { + %val = load <4 x i32>, <4 x i32> addrspace(1)* undef + ret <4 x i32> %val +} + +; GCN-LABEL: {{^}}v5i32_func_void: +; GCN-DAG: buffer_load_dword v4, off +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i32> @v5i32_func_void() #0 { + %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef + ret <5 x i32> %val +} + +; GCN-LABEL: {{^}}v8i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i32> @v8i32_func_void() #0 { + %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr + ret <8 x i32> %val +} + +; GCN-LABEL: {{^}}v16i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i32> @v16i32_func_void() #0 { + %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr + ret <16 x i32> %val +} + +; GCN-LABEL: {{^}}v32i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <32 x i32> @v32i32_func_void() #0 { + %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr + ret 
<32 x i32> %val +} + +; GCN-LABEL: {{^}}v2i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i64> @v2i64_func_void() #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* undef + ret <2 x i64> %val +} + +; GCN-LABEL: {{^}}v3i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i64> @v3i64_func_void() #0 { + %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef + %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr + ret <3 x i64> %val +} + +; GCN-LABEL: {{^}}v4i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN: buffer_load_dwordx4 v[4:7], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i64> @v4i64_func_void() #0 { + %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef + %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr + ret <4 x i64> %val +} + +; GCN-LABEL: {{^}}v5i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i64> @v5i64_func_void() #0 { + %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef + %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr + ret <5 x i64> %val +} + +; GCN-LABEL: {{^}}v8i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i64> @v8i64_func_void() #0 { + %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef + %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr + ret <8 x i64> %val +} + +; GCN-LABEL: {{^}}v16i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i64> @v16i64_func_void() #0 { + %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef + %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr + ret <16 x i64> %val +} + +; GCN-LABEL: {{^}}v2i16_func_void: +; GFX9: buffer_load_dword v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @v2i16_func_void() #0 { + %val = load <2 x i16>, <2 x i16> addrspace(1)* undef + ret <2 x i16> %val +} + +; GCN-LABEL: {{^}}v3i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <3 x i16> @v3i16_func_void() #0 { + %val = load <3 x i16>, <3 x i16> addrspace(1)* undef + ret <3 x i16> %val +} + +; GCN-LABEL: {{^}}v4i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <4 x i16> @v4i16_func_void() #0 { + %val = load <4 x i16>, <4 x i16> addrspace(1)* undef + ret <4 x i16> %val +} + +; FIXME: Should not scalarize +; GCN-LABEL: {{^}}v5i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9: buffer_load_ushort v4 +; 
GFX9: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9: v_mov_b32_e32 v2, v1 +; GFX9: v_lshrrev_b32_e32 v3, 16, v0 +; GCN: s_setpc_b64 +define <5 x i16> @v5i16_func_void() #0 { + %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef + %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr + ret <5 x i16> %val +} + +; GCN-LABEL: {{^}}v8i16_func_void: +; GFX9-DAG: buffer_load_dwordx4 v[0:3], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <8 x i16> @v8i16_func_void() #0 { + %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef + %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + ret <8 x i16> %val +} + +; GCN-LABEL: {{^}}v16i16_func_void: +; GFX9: buffer_load_dwordx4 v[0:3], off +; GFX9: buffer_load_dwordx4 v[4:7], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <16 x i16> @v16i16_func_void() #0 { + %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef + %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + ret <16 x i16> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v16i8_func_void: +; GCN-DAG: v12 +; GCN-DAG: v13 +; GCN-DAG: v14 +; GCN-DAG: v15 +define <16 x i8> @v16i8_func_void() #0 { + %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + ret <16 x i8> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v4i8_func_void: +; GCN: buffer_load_dword v0 +; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 +; CI-DAG: v_bfe_u32 v1, v0, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0 +; GCN: s_setpc_b64 +define <4 x i8> @v4i8_func_void() #0 { + %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef + %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + ret <4 x i8> %val +} + +; GCN-LABEL: {{^}}struct_i8_i32_func_void: +; GCN-DAG: buffer_load_dword v1 +; GCN-DAG: buffer_load_ubyte v0 +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define {i8, i32} @struct_i8_i32_func_void() #0 { + %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef + ret { i8, i32 } %val +} + +; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: +; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] +; GCN: buffer_load_dword [[VAL1:v[0-9]+]] +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}} +define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 { + %val0 = load volatile i8, i8 addrspace(1)* undef + %val1 = load volatile i32, i32 addrspace(1)* undef + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + store i8 %val0, i8* %gep0 + store i32 %val1, i32* %gep1 + ret void +} + +; GCN-LABEL: {{^}}v33i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: 
buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <33 x i32> @v33i32_func_void() #0 { + %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef + %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr + ret <33 x i32> %val +} + +; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, 
s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { + %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef + %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr + ret { <32 x i32>, i32 }%val +} + +; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen 
offset:208{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { + %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef + %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr + ret { i32, <32 x i32> }%val +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/hsa-func.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-func.ll +++ test/CodeGen/AMDGPU/hsa-func.ll @@ -27,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 44 +; ELF: Size: 48 ; ELF: Type: Function (0x2) ; ELF: } @@ -41,14 +41,12 @@ ; HSA: .p2align 2 ; HSA: {{^}}simple: ; HSA-NOT: amd_kernel_code_t - -; FIXME: Check this isn't a kernarg load when calling convention implemented. -; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: -; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 -; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 +; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000 ; Make sure we generate flat store for HSA ; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} @@ -56,8 +54,9 @@ ; HSA: .size simple, .Lfunc_end0-simple ; HSA: ; Function info: ; HSA-NOT: COMPUTE_PGM_RSRC2 -define void @simple(i32 addrspace(1)* %out) { +define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) { entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -191,7 +191,7 @@ ; CHECK: v_mov_b32_e32 v0, s0 ; CHECK: v_mov_b32_e32 v1, s1 ; CHECK: use v[0:1] -define void @i64_imm_input_phys_vgpr() { +define amdgpu_kernel void @i64_imm_input_phys_vgpr() { entry: call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456) ret void Index: test/CodeGen/AMDGPU/subreg_interference.mir =================================================================== --- test/CodeGen/AMDGPU/subreg_interference.mir +++ test/CodeGen/AMDGPU/subreg_interference.mir @@ -1,4 +1,12 @@ # RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s +--- | + + define amdgpu_kernel void @func0() { + ret void + } + +... 
+ --- # We should not detect any interference between v0/v1 here and only allocate # sgpr0-sgpr3.