diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6968,6 +6968,25 @@ the top 32 bits of the pipeline, so the shader may use the program counter's top 32 bits. +.. _pal_call-convention: + +Call Convention +~~~~~~~~~~~~~~~ + +For graphics use cases, the calling convention is ``amdgpu_gfx``. + +.. note:: + + ``amdgpu_gfx`` function calls are currently in development and are + subject to major changes. + +This calling convention shares most properties with calling non-kernel +functions (see +:ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions`). +Differences are: + + - Currently there are none; differences will be listed here. + Unspecified OS -------------- diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -241,6 +241,9 @@ /// The remainder matches the regular calling convention. WASM_EmscriptenInvoke = 99, + /// Calling convention used for AMD graphics targets. + AMDGPU_Gfx = 100, + /// The highest possible calling convention ID. Must be some 2^k - 1. 
MaxID = 1023 }; diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -624,6 +624,7 @@ KEYWORD(amdgpu_ps); KEYWORD(amdgpu_cs); KEYWORD(amdgpu_kernel); + KEYWORD(amdgpu_gfx); KEYWORD(tailcc); KEYWORD(cc); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2086,6 +2086,9 @@ case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break; + case lltok::kw_amdgpu_gfx: + CC = CallingConv::AMDGPU_Gfx; + break; case lltok::kw_amdgpu_ls: CC = CallingConv::AMDGPU_LS; break; case lltok::kw_amdgpu_hs: CC = CallingConv::AMDGPU_HS; break; case lltok::kw_amdgpu_es: CC = CallingConv::AMDGPU_ES; break; diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h --- a/llvm/lib/AsmParser/LLToken.h +++ b/llvm/lib/AsmParser/LLToken.h @@ -170,6 +170,7 @@ kw_amdgpu_ps, kw_amdgpu_cs, kw_amdgpu_kernel, + kw_amdgpu_gfx, kw_tailcc, // Attributes: diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -388,6 +388,9 @@ case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break; case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break; case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break; + case CallingConv::AMDGPU_Gfx: + Out << "amdgpu_gfx"; + break; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -456,7 +456,7 @@ Info = analyzeResourceUsage(MF); } - if (STM.isAmdPalOS()) + if (STM.isAmdPalOS() && MFI->isEntryFunction()) EmitPALMetadata(MF, CurrentProgramInfo); else if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, 
CurrentProgramInfo); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -459,9 +459,9 @@ assert(!Val == VRegs.empty() && "Return value without a vreg"); CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); - const bool IsShader = AMDGPU::isShader(CC); - const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) || - AMDGPU::isKernel(CC); + const bool IsGraphics = AMDGPU::isGraphics(CC); + const bool IsWaveEnd = + (IsGraphics && MFI->returnsVoid()) || AMDGPU::isKernel(CC); if (IsWaveEnd) { B.buildInstr(AMDGPU::S_ENDPGM) .addImm(0); @@ -471,7 +471,7 @@ auto const &ST = MF.getSubtarget(); unsigned ReturnOpc = - IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; + IsGraphics ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; auto Ret = B.buildInstrNoInsert(ReturnOpc); Register ReturnAddrVReg; @@ -785,7 +785,7 @@ if (CC == CallingConv::AMDGPU_KERNEL) return lowerFormalArgumentsKernel(B, F, VRegs); - const bool IsShader = AMDGPU::isShader(CC); + const bool IsGraphics = AMDGPU::isGraphics(CC); const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC); MachineFunction &MF = B.getMF(); @@ -826,7 +826,7 @@ const bool InReg = Arg.hasAttribute(Attribute::InReg); // SGPR arguments to functions not implemented. - if (!IsShader && InReg) + if (!IsGraphics && InReg) return false; if (Arg.hasAttribute(Attribute::SwiftSelf) || @@ -937,7 +937,7 @@ // Start adding system SGPRs. 
if (IsEntryFunc) { - TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); + TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -16,7 +16,75 @@ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; // Calling convention for SI -def CC_SI : CallingConv<[ +def CC_SI_Gfx : CallingConv<[ + // 0-3 are reserved for the stack buffer descriptor + // 30-31 are reserved for the return address + // 32 is reserved for the stack pointer + CCIfInReg>>, + + CCIfNotInReg>>, + + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, + CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, + CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, + CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, + CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, + CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> +]>; + +def RetCC_SI_Gfx : CallingConv<[ + // 0-3 are reserved for the stack buffer descriptor + // 32 is reserved for the stack pointer + CCIfInReg>>, + + CCIfNotInReg>>, + + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, + CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, + CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, + CCIfType<[v5i32, v5f32], CCAssignToStack<20, 4>>, + CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, + CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> +]>; + +def CC_SI_SHADER : CallingConv<[ CCIfInReg" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo>, + CCDelegateTo>, CCIf<"static_cast" 
"(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -939,6 +939,8 @@ case CallingConv::Fast: case CallingConv::Cold: return CC_AMDGPU_Func; + case CallingConv::AMDGPU_Gfx: + return CC_SI_Gfx; case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: default: @@ -960,6 +962,8 @@ case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_LS: return RetCC_SI_Shader; + case CallingConv::AMDGPU_Gfx: + return RetCC_SI_Gfx; case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -76,7 +76,7 @@ const GCNSubtarget *ST; const SITargetLowering *TLI; AMDGPUTTIImpl CommonTTI; - bool IsGraphicsShader; + bool IsGraphics; bool HasFP32Denormals; bool HasFP64FP16Denormals; unsigned MaxVGPRs; @@ -137,7 +137,7 @@ : BaseT(TM, F.getParent()->getDataLayout()), ST(static_cast(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), - IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())), + IsGraphics(AMDGPU::isGraphics(F.getCallingConv())), MaxVGPRs(ST->getMaxNumVGPRs( std::max(ST->getWavesPerEU(F).first, ST->getWavesPerEUForWorkGroup( @@ -217,7 +217,7 @@ unsigned getFlatAddressSpace() const { // Don't bother running InferAddressSpaces pass on graphics shaders which // don't use flat addressing. 
- if (IsGraphicsShader) + if (IsGraphics) return -1; return AMDGPUAS::FLAT_ADDRESS; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -722,31 +722,6 @@ } } -static bool isArgPassedInSGPR(const Argument *A) { - const Function *F = A->getParent(); - - // Arguments to compute shaders are never a source of divergence. - CallingConv::ID CC = F->getCallingConv(); - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_LS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_ES: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - // For non-compute shaders, SGPR inputs are marked with either inreg. - // Everything else is in VGPRs. - return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg); - default: - // TODO: Should calls support inreg for SGPR inputs? - return false; - } -} - /// Analyze if the results of inline asm are divergent. If \p Indices is empty, /// this is analyzing the collective result of all output registers. Otherwise, /// this is only querying a specific result index if this returns multiple @@ -803,7 +778,7 @@ /// different across workitems in a wavefront. 
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (const Argument *A = dyn_cast(V)) - return !isArgPassedInSGPR(A); + return !AMDGPU::isArgPassedInSGPR(A); // Loads from the private and flat address spaces are divergent, because // threads can execute the load instruction with the same inputs and get diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1780,12 +1780,11 @@ return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); } -static void processShaderInputArgs(SmallVectorImpl &Splits, - CallingConv::ID CallConv, - ArrayRef Ins, - BitVector &Skipped, - FunctionType *FType, - SIMachineFunctionInfo *Info) { +static void processPSInputArgs(SmallVectorImpl &Splits, + CallingConv::ID CallConv, + ArrayRef Ins, BitVector &Skipped, + FunctionType *FType, + SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; @@ -2237,7 +2236,7 @@ FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); - if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { DiagnosticInfoUnsupported NoGraphicsHSA( Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); @@ -2250,12 +2249,20 @@ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); - bool IsShader = AMDGPU::isShader(CallConv); + bool IsGraphics = AMDGPU::isGraphics(CallConv); bool IsKernel = AMDGPU::isKernel(CallConv); bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); - if (IsShader) { - processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); + if (IsGraphics) { + assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && + !Info->hasFlatScratchInit() && 
!Info->hasWorkGroupIDX() && + !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && + !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + } + + if (CallConv == CallingConv::AMDGPU_PS) { + processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); // At least one interpolation mode must be enabled or else the GPU will // hang. @@ -2270,39 +2277,28 @@ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. - if (CallConv == CallingConv::AMDGPU_PS) { - if ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11))) { - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - Info->markPSInputAllocated(0); - Info->markPSInputEnabled(0); - } - if (Subtarget->isAmdPalOS()) { - // For isAmdPalOS, the user does not enable some bits after compilation - // based on run-time states; the register values being generated here are - // the final ones set in hardware. Therefore we need to apply the - // workaround to PSInputAddr and PSInputEnable together. (The case where - // a bit is set in PSInputAddr but not PSInputEnable is where the - // frontend set up an input arg for a particular interpolation mode, but - // nothing uses that input arg. Really we should have an earlier pass - // that removes such an arg.) 
- unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); - if ((PsInputBits & 0x7F) == 0 || - ((PsInputBits & 0xF) == 0 && - (PsInputBits >> 11 & 1))) - Info->markPSInputEnabled( - countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); - } + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } + if (Subtarget->isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the + // frontend set up an input arg for a particular interpolation mode, but + // nothing uses that input arg. Really we should have an earlier pass + // that removes such an arg.) + unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } - - assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { @@ -2449,7 +2445,7 @@ // Start adding system SGPRs. 
if (IsEntryFunc) { - allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); + allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); @@ -2932,7 +2928,7 @@ report_fatal_error("unsupported libcall legalization"); if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - !CLI.CB->getCalledFunction()) { + !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); } @@ -2942,11 +2938,11 @@ "unsupported required tail call to function "); } - if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { - // Note the issue is with the CC of the calling function, not of the call + if (AMDGPU::isShader(CallConv)) { + // Note the issue is with the CC of the called function, not of the call // itself. return lowerUnhandledCall(CLI, InVals, - "unsupported call from graphics shader of function "); + "unsupported call to a shader function "); } if (IsTailCall) { @@ -2977,7 +2973,8 @@ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + if (AMDGPUTargetMachine::EnableFixedFunctionABI && + CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3111,7 +3108,8 @@ } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI) { + if (!AMDGPUTargetMachine::EnableFixedFunctionABI && + CallConv != CallingConv::AMDGPU_Gfx) { // Copy special input registers after user input arguments. 
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5087,8 +5087,7 @@ // scratch memory access. In both cases, the legalization never involves // conversion to the addr64 form. if (isMIMG(MI) || - (AMDGPU::isShader(MF.getFunction().getCallingConv()) && - (isMUBUF(MI) || isMTBUF(MI)))) { + ((ST.isAmdPalOS() || ST.isMesa3DOS()) && (isMUBUF(MI) || isMTBUF(MI)))) { MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -276,6 +276,7 @@ FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index); for (MachineBasicBlock &MBB : MF) { + assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR"); MBB.addLiveIn(LowestAvailableVGPR); MBB.sortUniqueLiveIns(); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -125,6 +125,7 @@ case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: + case CallingConv::AMDGPU_Gfx: return CSR_AMDGPU_HighRegs_SaveList; default: { // Dummy to not crash RegisterClassInfo. 
@@ -145,6 +146,7 @@ case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: + case CallingConv::AMDGPU_Gfx: return CSR_AMDGPU_HighRegs_RegMask; default: return nullptr; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -567,6 +567,9 @@ LLVM_READNONE bool isShader(CallingConv::ID CC); +LLVM_READNONE +bool isGraphics(CallingConv::ID CC); + LLVM_READNONE bool isCompute(CallingConv::ID CC); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1043,8 +1043,12 @@ } } +bool isGraphics(CallingConv::ID cc) { + return isShader(cc) || cc == CallingConv::AMDGPU_Gfx; +} + bool isCompute(CallingConv::ID cc) { - return !isShader(cc) || cc == CallingConv::AMDGPU_CS; + return !isGraphics(cc) || cc == CallingConv::AMDGPU_CS; } bool isEntryFunctionCC(CallingConv::ID CC) { @@ -1439,6 +1443,7 @@ case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_Gfx: // For non-compute shaders, SGPR inputs are marked with either inreg or byval. // Everything else is in VGPRs. 
return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -733,6 +733,8 @@ return ".hs"; case CallingConv::AMDGPU_LS: return ".ls"; + case CallingConv::AMDGPU_Gfx: + llvm_unreachable("Callable shader has no hardware stage"); default: return ".cs"; } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -478,6 +478,8 @@ ; CHECK: declare amdgpu_cs void @f.cc90() declare amdgpu_cs void @f.amdgpu_cs() ; CHECK: declare amdgpu_cs void @f.amdgpu_cs() +declare amdgpu_gfx void @f.amdgpu_gfx() +; CHECK: declare amdgpu_gfx void @f.amdgpu_gfx() declare cc91 void @f.cc91() ; CHECK: declare amdgpu_kernel void @f.cc91() declare amdgpu_kernel void @f.amdgpu_kernel() diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -0,0 +1,10 @@ +; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s + +; GCN-LABEL: {{^}}gfx_callable_amdpal: +; GCN: .amd_amdgpu_pal_metadata {{$}} +define amdgpu_gfx half @gfx_callable_amdpal(half %arg0) { + %add = fadd half %arg0, 1.0 + ret half %add +} diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll new file mode 100644 --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -0,0 +1,6566 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0 +declare hidden amdgpu_gfx void @external_void_func_i1_signext(i1 signext) #0 +declare hidden amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext) #0 + +declare hidden amdgpu_gfx void @external_void_func_i8(i8) #0 +declare hidden amdgpu_gfx void @external_void_func_i8_signext(i8 signext) #0 +declare hidden amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext) #0 + +declare hidden amdgpu_gfx void @external_void_func_i16(i16) #0 +declare hidden amdgpu_gfx void @external_void_func_i16_signext(i16 signext) #0 +declare hidden amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext) #0 + +declare hidden amdgpu_gfx void @external_void_func_i32(i32) #0 +declare hidden amdgpu_gfx void @external_void_func_i64(i64) #0 +declare hidden amdgpu_gfx void @external_void_func_v2i64(<2 x i64>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i64(<3 x i64>) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i64(<4 x i64>) #0 + +declare hidden amdgpu_gfx void @external_void_func_f16(half) #0 +declare hidden amdgpu_gfx void @external_void_func_f32(float) #0 +declare hidden amdgpu_gfx void @external_void_func_f64(double) #0 +declare hidden amdgpu_gfx void @external_void_func_v2f32(<2 x float>) #0 +declare hidden amdgpu_gfx void @external_void_func_v2f64(<2 x double>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f32(<3 x float>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f64(<3 x double>) #0 +declare hidden amdgpu_gfx void @external_void_func_v5f32(<5 x float>) #0 + +declare 
hidden amdgpu_gfx void @external_void_func_v2i16(<2 x i16>) #0 +declare hidden amdgpu_gfx void @external_void_func_v2f16(<2 x half>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i16(<3 x i16>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f16(<3 x half>) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i16(<4 x i16>) #0 +declare hidden amdgpu_gfx void @external_void_func_v4f16(<4 x half>) #0 + +declare hidden amdgpu_gfx void @external_void_func_v2i32(<2 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i32(<3 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32>, i32) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i32(<4 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v5i32(<5 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v8i32(<8 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v16i32(<16 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v32i32(<32 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 + +declare hidden amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg) #0 + +declare hidden amdgpu_gfx void @external_void_func_f16_inreg(half inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_f32_inreg(float inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_f64_inreg(double inreg) #0 +declare hidden amdgpu_gfx void 
@external_void_func_v2f32_inreg(<2 x float> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg) #0 + +declare hidden amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v4f16_inreg(<4 x half> inreg) #0 + +declare hidden amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg, i32 inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg) #0 +declare hidden amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg, i32 inreg) #0 + +; return value and argument +declare hidden amdgpu_gfx i32 @external_i32_func_i32(i32) #0 + +; Structs +declare hidden amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 }) #0 +declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0 
+declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 + +declare hidden amdgpu_gfx void @external_void_func_v16i8(<16 x i8>) #0 + +define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_i1_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i1_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: 
s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i1(i1 true) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; 
GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i1_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i1, i1 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i1_signext(i1 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i1_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i1_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i1, i1 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i1_zeroext(i1 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i8_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i8_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i8(i8 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, 
s5, external_void_func_i8_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i8_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_sbyte v0, v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i8, i8 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i8_signext(i8 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { +; 
GFX9-LABEL: test_call_external_void_func_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i8_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: 
s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i8, i8 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i8_zeroext(i8 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i16_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i16(i16 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 
s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i16_signext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i16, i16 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i16_signext(i16 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i16_zeroext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %var = load volatile i16, i16 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_i16_zeroext(i16 %var) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i32(i32 42) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 
0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i64(i64 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 
exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, 
s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i64>, <2 x i64> addrspace(1)* null + call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_mov_b32_e32 v5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] 
+ %load = load <2 x i64>, <2 x i64> addrspace(1)* null + %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> + + call amdgpu_gfx void @external_void_func_v3i64(<3 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_mov_b32_e32 v6, 3 +; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 
v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_mov_b32_e32 v5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 3 +; GFX10-NEXT: v_mov_b32_e32 v7, 4 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %load = load <2 x i64>, <2 x i64> addrspace(1)* null + %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5,
external_void_func_f16@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f16_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f16(half 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_f32_imm: +; GFX9: 
; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; 
GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f32(float 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 
exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32
s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v5f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5],
-1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v5f32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f64(double 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v2f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call 
amdgpu_gfx void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f64_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32
v3, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +;
GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i16>, <2 x i16> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, 
s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 
; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <3 x i16>, <3 x i16> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: 
; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <3 x half>, <3 x half> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i16_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT:
v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f16_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 
4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f16(<3 x half> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; 
GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <4 x i16>, <4 x i16> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 
4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i16_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x half>, <2 x half> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, 
external_void_func_v2i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i32>, <2 x i32> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 
exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v3i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 
+; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 
+; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v3i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i32_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, 5 
+; GFX10-NEXT: v_mov_b32_e32 v3, 6 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> , i32 6) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <4 x i32>, <4 x i32> addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: 
v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 
s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v5i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v5i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte 
Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { +; GFX9-LABEL: test_call_external_void_func_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[4:5], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[4:5], off 
offset:16 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v8i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[4:5], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef + %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr + call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { +; GFX9-LABEL: test_call_external_void_func_v8i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_mov_b32_e32 v5, 6 +; GFX9-NEXT: v_mov_b32_e32 v6, 7 +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v8i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 5 +; GFX10-NEXT: v_mov_b32_e32 v5, 6 +; GFX10-NEXT: v_mov_b32_e32 v6, 7 +; GFX10-NEXT: v_mov_b32_e32 v7, 8 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { +; GFX9-LABEL: test_call_external_void_func_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31,
1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[12:13], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v16i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v13, s5 +; GFX10-NEXT: v_mov_b32_e32 v12, s4 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX10-NEXT: 
s_clause 0x3 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[12:13], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef + %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr + call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { +; GFX9-LABEL: test_call_external_void_func_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v29, s5 +; GFX9-NEXT: v_mov_b32_e32 v28, s4 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 +; GFX9-NEXT: global_load_dwordx4 
v[20:23], v[28:29], off offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v32i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v29, s5 +; GFX10-NEXT: v_mov_b32_e32 v28, s4 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 +; GFX10-NEXT: 
global_load_dwordx4 v[12:15], v[28:29], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef + %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr + call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v32i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v29, s5 +; GFX9-NEXT: v_mov_b32_e32 v28, s4 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 +; GFX9-NEXT: global_load_dwordx4 
v[16:19], v[28:29], off offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: global_load_dword v32, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v32i32_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v29, s5 +; GFX10-NEXT: v_mov_b32_e32 v28, s4 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, 
external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-NEXT: global_load_dword v32, v[0:1], off +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef + %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 + %val1 = load i32, i32 addrspace(1)* undef + call amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1) + ret void +} + +define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { +; GFX9-LABEL: test_call_external_i32_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v42, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: v_readlane_b32 s4, v42, 0 +; GFX9-NEXT: v_readlane_b32 s5, v42, 1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v42, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_i32_func_i32_imm: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v42, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_nop 1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, v0 +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; 
GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: global_store_dword v[40:41], v0, off +; GFX10-NEXT: v_readlane_b32 s4, v42, 0 +; GFX10-NEXT: v_readlane_b32 s5, v42, 1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v42, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42) + store volatile i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { +; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 +; GFX9-NEXT: global_load_ubyte v0, v[2:3], off +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ubyte v0, v[1:2], off +; GFX10-NEXT: global_load_dword v1, v[1:2], off offset:4 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef + %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 + call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { +; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = alloca { i8, i32 }, align 4, addrspace(5) + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1 + store i8 3, i8 addrspace(5)* %gep0 + store i32 8, i32 addrspace(5)* %gep1 + call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_byte v[0:1], v0, off +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_byte v[0:1], v0, off +; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[4:5] + %in.val = alloca { i8, i32 }, align 4, addrspace(5) + %out.val = alloca { i8, i32 }, align 4, addrspace(5) + %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0 + %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1 + store i8 
3, i8 addrspace(5)* %in.gep0 + store i32 8, i32 addrspace(5)* %in.gep1 + call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val) + %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0 + %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1 + %out.val0 = load i8, i8 addrspace(5)* %out.gep0 + %out.val1 = load i32, i32 addrspace(5)* %out.gep1 + + store volatile i8 %out.val0, i8 addrspace(1)* undef + store volatile i32 %out.val1, i32 addrspace(1)* undef + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { +; GFX9-LABEL: test_call_external_void_func_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX9-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v16i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX10-NEXT: v_mov_b32_e32 v12, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v16 +; GFX10-NEXT: v_mov_b32_e32 v2, v17 +; GFX10-NEXT: v_mov_b32_e32 v3, v18 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef + %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val) + ret void +} + +define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { +; GFX9-LABEL: tail_call_byval_align16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; 
GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: tail_call_byval_align16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x400 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 
s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] +entry: + %alloca = alloca double, align 8, addrspace(5) + tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval align 16 %alloca) + ret void +} + +; inreg arguments are put in sgprs +define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i1_inreg(i1 true) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, 
external_void_func_i8_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i8_inreg(i8 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { +; GFX9-LABEL: 
test_call_external_void_func_i16_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; 
GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i16_inreg(i16 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 42 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded 
Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i32_inreg(i32 42) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_i64_inreg(i64 123) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i64_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i64_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i64>, <2 x i64> addrspace(4)* null + call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i64_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: s_getpc_b64 s[10:11] +; 
GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i64_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[10:11] +; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %load = load <2 x i64>, <2 x i64> addrspace(4)* null + %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> + + call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i64_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: s_mov_b32 s10, 3 +; GFX9-NEXT: s_mov_b32 s11, 4 +; GFX9-NEXT: s_getpc_b64 s[12:13] +; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i64_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 
exec_lo, s4 +; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: s_mov_b32 s10, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s11, 4 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[12:13] +; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %load = load <2 x i64>, <2 x i64> addrspace(4)* null + %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> + call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x4400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 +; 
GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f16_inreg(half 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 
{ +; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 4.0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], 
s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f32_inreg(float 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 +; 
GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() 
#0 { +; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: s_mov_b32 s7, -1.0 +; GFX9-NEXT: s_mov_b32 s8, 0.5 +; GFX9-NEXT: s_getpc_b64 s[10:11] +; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: s_mov_b32 s7, -1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s8, 0.5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: 
s_getpc_b64 s[10:11] +; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 0x40100000 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_f64_inreg(double 4.0) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, 
s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, 0x40200000 +; GFX9-NEXT: s_getpc_b64 s[10:11] +; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b32 s9, 0x40200000 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[10:11] +; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, 
external_void_func_v2i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i16_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i16>, <2 x i16> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> %val) + ret void +} + +define amdgpu_gfx void 
@test_call_external_void_func_v3i16_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i16_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <3 x i16>, <3 x i16> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f16_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <3 x half>, <3 x half> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 3 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, 
external_void_func_v3i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> ) + 
ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX9-NEXT: s_movk_i32 s5, 0x4400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 +; 
GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i16_inreg: +; 
GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <4 x i16>, <4 x i16> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 0x40003 +; 
GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] 
+ call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2f16_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2f16_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, 
external_void_func_v2f16_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x half>, <2 x half> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 
s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <2 x i32>, <2 x i32> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; 
GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: 
s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: s_mov_b32 s7, 6 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: s_mov_b32 s7, 6 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> , i32 6) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %val = load <4 x i32>, <4 x i32> addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; 
GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: s_getpc_b64 s[10:11] +; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s11, s11, 
external_void_func_v5i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[10:11] +; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void 
@external_void_func_v5i32_inreg(<5 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[12:13] +; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v8i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[12:13] +; GFX10-NEXT: 
s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef + %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr + call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: s_mov_b32 s9, 6 +; GFX9-NEXT: s_mov_b32 s10, 7 +; GFX9-NEXT: s_mov_b32 s11, 8 +; GFX9-NEXT: s_getpc_b64 s[12:13] +; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 
v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: s_mov_b32 s9, 6 +; GFX10-NEXT: s_mov_b32 s10, 7 +; GFX10-NEXT: s_mov_b32 s11, 8 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[12:13] +; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void 
@external_void_func_v8i32_inreg(<8 x i32> ) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v16i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v16i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_getpc_b64 s[20:21] +; 
GFX10-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef + %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr + call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { +; GFX9-LABEL: test_call_external_void_func_v32i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 18 +; GFX9-NEXT: v_writelane_b32 v40, s36, 0 +; GFX9-NEXT: v_writelane_b32 v40, s37, 1 +; GFX9-NEXT: v_writelane_b32 v40, s38, 2 +; GFX9-NEXT: v_writelane_b32 v40, s39, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 4 +; GFX9-NEXT: v_writelane_b32 v40, s41, 5 +; GFX9-NEXT: v_writelane_b32 v40, s42, 6 +; GFX9-NEXT: v_writelane_b32 v40, s43, 7 +; GFX9-NEXT: v_writelane_b32 v40, s44, 8 +; GFX9-NEXT: v_writelane_b32 v40, s45, 9 +; GFX9-NEXT: v_writelane_b32 v40, s46, 10 +; GFX9-NEXT: v_writelane_b32 v40, s47, 11 +; GFX9-NEXT: v_writelane_b32 v40, s48, 12 +; GFX9-NEXT: 
s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s49, 13 +; GFX9-NEXT: v_writelane_b32 v40, s50, 14 +; GFX9-NEXT: v_writelane_b32 v40, s51, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: v_mov_b32_e32 v2, s48 +; GFX9-NEXT: v_mov_b32_e32 v3, s49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s4, v40, 16 +; GFX9-NEXT: v_readlane_b32 s5, v40, 17 +; GFX9-NEXT: v_readlane_b32 s51, v40, 15 +; GFX9-NEXT: v_readlane_b32 s50, v40, 14 +; GFX9-NEXT: v_readlane_b32 s49, v40, 13 +; GFX9-NEXT: v_readlane_b32 s48, v40, 12 +; GFX9-NEXT: v_readlane_b32 s47, v40, 11 +; GFX9-NEXT: v_readlane_b32 s46, v40, 10 +; 
GFX9-NEXT: v_readlane_b32 s45, v40, 9 +; GFX9-NEXT: v_readlane_b32 s44, v40, 8 +; GFX9-NEXT: v_readlane_b32 s43, v40, 7 +; GFX9-NEXT: v_readlane_b32 s42, v40, 6 +; GFX9-NEXT: v_readlane_b32 s41, v40, 5 +; GFX9-NEXT: v_readlane_b32 s40, v40, 4 +; GFX9-NEXT: v_readlane_b32 s39, v40, 3 +; GFX9-NEXT: v_readlane_b32 s38, v40, 2 +; GFX9-NEXT: v_readlane_b32 s37, v40, 1 +; GFX9-NEXT: v_readlane_b32 s36, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 18 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v32i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-NEXT: v_writelane_b32 v40, s37, 1 +; GFX10-NEXT: v_writelane_b32 v40, s38, 2 +; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: v_writelane_b32 v40, s40, 4 +; GFX10-NEXT: v_writelane_b32 v40, s41, 5 +; GFX10-NEXT: v_writelane_b32 v40, s42, 6 +; GFX10-NEXT: v_writelane_b32 v40, s43, 7 +; GFX10-NEXT: v_writelane_b32 v40, s44, 8 +; GFX10-NEXT: v_writelane_b32 v40, s45, 9 +; GFX10-NEXT: v_writelane_b32 v40, s46, 10 +; GFX10-NEXT: v_writelane_b32 v40, s47, 11 +; GFX10-NEXT: v_writelane_b32 v40, s48, 12 +; GFX10-NEXT: v_writelane_b32 v40, s49, 13 +; GFX10-NEXT: v_writelane_b32 v40, s50, 14 +; GFX10-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-NEXT: 
v_writelane_b32 v40, s30, 16 +; GFX10-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, s39 +; GFX10-NEXT: s_mov_b32 s24, s40 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: s_mov_b32 s27, s43 +; GFX10-NEXT: s_mov_b32 s28, s44 +; GFX10-NEXT: s_mov_b32 s29, s45 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s4, v40, 16 +; GFX10-NEXT: v_readlane_b32 s5, v40, 17 +; GFX10-NEXT: v_readlane_b32 s51, v40, 15 +; GFX10-NEXT: v_readlane_b32 s50, v40, 14 +; GFX10-NEXT: v_readlane_b32 s49, v40, 13 +; GFX10-NEXT: v_readlane_b32 s48, v40, 12 +; GFX10-NEXT: v_readlane_b32 s47, v40, 11 +; GFX10-NEXT: v_readlane_b32 s46, v40, 10 +; GFX10-NEXT: v_readlane_b32 s45, v40, 9 +; GFX10-NEXT: v_readlane_b32 s44, v40, 8 +; GFX10-NEXT: v_readlane_b32 s43, v40, 7 +; GFX10-NEXT: v_readlane_b32 s42, v40, 6 +; GFX10-NEXT: v_readlane_b32 
s41, v40, 5 +; GFX10-NEXT: v_readlane_b32 s40, v40, 4 +; GFX10-NEXT: v_readlane_b32 s39, v40, 3 +; GFX10-NEXT: v_readlane_b32 s38, v40, 2 +; GFX10-NEXT: v_readlane_b32 s37, v40, 1 +; GFX10-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef + %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr + call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> %val) + ret void +} + +define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { +; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 18 +; GFX9-NEXT: v_writelane_b32 v40, s36, 0 +; GFX9-NEXT: v_writelane_b32 v40, s37, 1 +; GFX9-NEXT: v_writelane_b32 v40, s38, 2 +; GFX9-NEXT: v_writelane_b32 v40, s39, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 4 +; GFX9-NEXT: v_writelane_b32 v40, s41, 5 +; GFX9-NEXT: v_writelane_b32 v40, s42, 6 +; GFX9-NEXT: v_writelane_b32 v40, s43, 7 +; GFX9-NEXT: v_writelane_b32 v40, s44, 8 +; GFX9-NEXT: v_writelane_b32 v40, s45, 9 +; GFX9-NEXT: v_writelane_b32 v40, s46, 10 +; GFX9-NEXT: v_writelane_b32 v40, s47, 11 +; GFX9-NEXT: v_writelane_b32 v40, s48, 12 +; GFX9-NEXT: v_writelane_b32 v40, s49, 13 +; GFX9-NEXT: v_writelane_b32 v40, s50, 14 +; GFX9-NEXT: v_writelane_b32 v40, s51, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: 
s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s22, s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s4, v40, 16 +; GFX9-NEXT: v_readlane_b32 s5, v40, 17 +; GFX9-NEXT: v_readlane_b32 s51, v40, 15 +; GFX9-NEXT: v_readlane_b32 s50, v40, 14 +; GFX9-NEXT: v_readlane_b32 s49, v40, 13 +; GFX9-NEXT: v_readlane_b32 s48, v40, 12 +; GFX9-NEXT: v_readlane_b32 s47, v40, 11 +; GFX9-NEXT: v_readlane_b32 s46, v40, 10 +; GFX9-NEXT: v_readlane_b32 s45, v40, 9 +; GFX9-NEXT: v_readlane_b32 
s44, v40, 8 +; GFX9-NEXT: v_readlane_b32 s43, v40, 7 +; GFX9-NEXT: v_readlane_b32 s42, v40, 6 +; GFX9-NEXT: v_readlane_b32 s41, v40, 5 +; GFX9-NEXT: v_readlane_b32 s40, v40, 4 +; GFX9-NEXT: v_readlane_b32 s39, v40, 3 +; GFX9-NEXT: v_readlane_b32 s38, v40, 2 +; GFX9-NEXT: v_readlane_b32 s37, v40, 1 +; GFX9-NEXT: v_readlane_b32 s36, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 18 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s36, 0 +; GFX10-NEXT: v_writelane_b32 v40, s37, 1 +; GFX10-NEXT: v_writelane_b32 v40, s38, 2 +; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: v_writelane_b32 v40, s40, 4 +; GFX10-NEXT: v_writelane_b32 v40, s41, 5 +; GFX10-NEXT: v_writelane_b32 v40, s42, 6 +; GFX10-NEXT: v_writelane_b32 v40, s43, 7 +; GFX10-NEXT: v_writelane_b32 v40, s44, 8 +; GFX10-NEXT: v_writelane_b32 v40, s45, 9 +; GFX10-NEXT: v_writelane_b32 v40, s46, 10 +; GFX10-NEXT: v_writelane_b32 v40, s47, 11 +; GFX10-NEXT: v_writelane_b32 v40, s48, 12 +; GFX10-NEXT: v_writelane_b32 v40, s49, 13 +; GFX10-NEXT: v_writelane_b32 v40, s50, 14 +; GFX10-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-NEXT: v_writelane_b32 v40, s30, 16 +; GFX10-NEXT: v_writelane_b32 
v40, s31, 17 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s20, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s20 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, s39 +; GFX10-NEXT: s_mov_b32 s24, s40 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: s_mov_b32 s27, s43 +; GFX10-NEXT: s_mov_b32 s28, s44 +; GFX10-NEXT: s_mov_b32 s29, s45 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s4, v40, 16 +; GFX10-NEXT: v_readlane_b32 s5, v40, 17 +; GFX10-NEXT: v_readlane_b32 s51, v40, 15 +; GFX10-NEXT: v_readlane_b32 s50, v40, 14 +; GFX10-NEXT: v_readlane_b32 s49, v40, 13 +; GFX10-NEXT: v_readlane_b32 s48, v40, 12 +; GFX10-NEXT: v_readlane_b32 s47, v40, 11 +; GFX10-NEXT: v_readlane_b32 s46, v40, 10 +; GFX10-NEXT: v_readlane_b32 s45, v40, 9 +; GFX10-NEXT: v_readlane_b32 s44, v40, 8 +; GFX10-NEXT: 
v_readlane_b32 s43, v40, 7 +; GFX10-NEXT: v_readlane_b32 s42, v40, 6 +; GFX10-NEXT: v_readlane_b32 s41, v40, 5 +; GFX10-NEXT: v_readlane_b32 s40, v40, 4 +; GFX10-NEXT: v_readlane_b32 s39, v40, 3 +; GFX10-NEXT: v_readlane_b32 s38, v40, 2 +; GFX10-NEXT: v_readlane_b32 s37, v40, 1 +; GFX10-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef + %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0 + %val1 = load i32, i32 addrspace(4)* undef + call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> %val0, i32 %val1) + ret void +} + +define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { +; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], 
s32 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] +entry: + call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) + ret void +} + +define amdgpu_gfx void @stack_12xv3i32() #0 { +; GFX9-LABEL: stack_12xv3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 2 +; GFX9-NEXT: v_mov_b32_e32 v7, 2 +; GFX9-NEXT: v_mov_b32_e32 v8, 2 +; GFX9-NEXT: v_mov_b32_e32 v9, 3 +; GFX9-NEXT: v_mov_b32_e32 v10, 3 +; GFX9-NEXT: v_mov_b32_e32 v11, 3 +; GFX9-NEXT: v_mov_b32_e32 v12, 4 +; GFX9-NEXT: v_mov_b32_e32 v13, 4 +; GFX9-NEXT: v_mov_b32_e32 v14, 4 +; GFX9-NEXT: v_mov_b32_e32 v15, 5 +; GFX9-NEXT: v_mov_b32_e32 v16, 5 +; GFX9-NEXT: v_mov_b32_e32 v17, 5 +; GFX9-NEXT: v_mov_b32_e32 v18, 6 +; GFX9-NEXT: v_mov_b32_e32 v19, 6 +; GFX9-NEXT: v_mov_b32_e32 v20, 6 +; GFX9-NEXT: v_mov_b32_e32 v21, 7 +; GFX9-NEXT: v_mov_b32_e32 v22, 7 +; GFX9-NEXT: v_mov_b32_e32 v23, 7 +; GFX9-NEXT: v_mov_b32_e32 v24, 8 +; GFX9-NEXT: v_mov_b32_e32 v25, 8 +; GFX9-NEXT: v_mov_b32_e32 v26, 8 
+; GFX9-NEXT: v_mov_b32_e32 v27, 9 +; GFX9-NEXT: v_mov_b32_e32 v28, 9 +; GFX9-NEXT: v_mov_b32_e32 v29, 9 +; GFX9-NEXT: v_mov_b32_e32 v30, 10 +; GFX9-NEXT: v_mov_b32_e32 v31, 11 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: stack_12xv3i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 12 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 14 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_mov_b32_e32 v5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-NEXT: v_mov_b32_e32 v6, 2 +; GFX10-NEXT: 
v_mov_b32_e32 v7, 2 +; GFX10-NEXT: v_mov_b32_e32 v8, 2 +; GFX10-NEXT: v_mov_b32_e32 v9, 3 +; GFX10-NEXT: v_mov_b32_e32 v10, 3 +; GFX10-NEXT: v_mov_b32_e32 v11, 3 +; GFX10-NEXT: v_mov_b32_e32 v12, 4 +; GFX10-NEXT: v_mov_b32_e32 v13, 4 +; GFX10-NEXT: v_mov_b32_e32 v14, 4 +; GFX10-NEXT: v_mov_b32_e32 v15, 5 +; GFX10-NEXT: v_mov_b32_e32 v16, 5 +; GFX10-NEXT: v_mov_b32_e32 v17, 5 +; GFX10-NEXT: v_mov_b32_e32 v18, 6 +; GFX10-NEXT: v_mov_b32_e32 v19, 6 +; GFX10-NEXT: v_mov_b32_e32 v20, 6 +; GFX10-NEXT: v_mov_b32_e32 v21, 7 +; GFX10-NEXT: v_mov_b32_e32 v22, 7 +; GFX10-NEXT: v_mov_b32_e32 v23, 7 +; GFX10-NEXT: v_mov_b32_e32 v24, 8 +; GFX10-NEXT: v_mov_b32_e32 v25, 8 +; GFX10-NEXT: v_mov_b32_e32 v26, 8 +; GFX10-NEXT: v_mov_b32_e32 v27, 9 +; GFX10-NEXT: v_mov_b32_e32 v28, 9 +; GFX10-NEXT: v_mov_b32_e32 v29, 9 +; GFX10-NEXT: v_mov_b32_e32 v30, 10 +; GFX10-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] +entry: + call amdgpu_gfx void @external_void_func_12xv3i32( + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>, + <3 x i32>) + ret void +} + +define amdgpu_gfx void @stack_8xv5i32() #0 { +; GFX9-LABEL: stack_8xv5i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 
s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v10, 2 +; GFX9-NEXT: v_mov_b32_e32 v11, 2 +; GFX9-NEXT: v_mov_b32_e32 v12, 2 +; GFX9-NEXT: v_mov_b32_e32 v13, 2 +; GFX9-NEXT: v_mov_b32_e32 v14, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, 3 +; GFX9-NEXT: v_mov_b32_e32 v16, 3 +; GFX9-NEXT: v_mov_b32_e32 v17, 3 +; GFX9-NEXT: v_mov_b32_e32 v18, 3 +; GFX9-NEXT: v_mov_b32_e32 v19, 3 +; GFX9-NEXT: v_mov_b32_e32 v20, 4 +; GFX9-NEXT: v_mov_b32_e32 v21, 4 +; GFX9-NEXT: v_mov_b32_e32 v22, 4 +; GFX9-NEXT: v_mov_b32_e32 v23, 4 +; GFX9-NEXT: v_mov_b32_e32 v24, 4 +; GFX9-NEXT: v_mov_b32_e32 v25, 
5 +; GFX9-NEXT: v_mov_b32_e32 v26, 5 +; GFX9-NEXT: v_mov_b32_e32 v27, 5 +; GFX9-NEXT: v_mov_b32_e32 v28, 5 +; GFX9-NEXT: v_mov_b32_e32 v29, 5 +; GFX9-NEXT: v_mov_b32_e32 v30, 6 +; GFX9-NEXT: v_mov_b32_e32 v31, 7 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: stack_8xv5i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 9 +; GFX10-NEXT: v_mov_b32_e32 v2, 10 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v3, 14 +; GFX10-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-NEXT: v_mov_b32_e32 v5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v0, 11 +; GFX10-NEXT: v_mov_b32_e32 v1, 12 +; GFX10-NEXT: v_mov_b32_e32 v2, 13 +; GFX10-NEXT: v_mov_b32_e32 v6, 1 +; GFX10-NEXT: v_mov_b32_e32 v7, 1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:12 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 1 +; GFX10-NEXT: v_mov_b32_e32 v9, 1 +; GFX10-NEXT: v_mov_b32_e32 v10, 2 +; GFX10-NEXT: v_mov_b32_e32 v11, 2 +; GFX10-NEXT: v_mov_b32_e32 v12, 2 +; GFX10-NEXT: v_mov_b32_e32 v13, 2 +; GFX10-NEXT: v_mov_b32_e32 v14, 2 +; GFX10-NEXT: v_mov_b32_e32 v15, 3 +; GFX10-NEXT: v_mov_b32_e32 v16, 3 +; GFX10-NEXT: v_mov_b32_e32 v17, 3 +; GFX10-NEXT: v_mov_b32_e32 v18, 3 +; GFX10-NEXT: v_mov_b32_e32 v19, 3 +; GFX10-NEXT: v_mov_b32_e32 v20, 4 +; GFX10-NEXT: v_mov_b32_e32 v21, 4 +; GFX10-NEXT: v_mov_b32_e32 v22, 4 +; GFX10-NEXT: v_mov_b32_e32 v23, 4 +; GFX10-NEXT: v_mov_b32_e32 v24, 4 +; GFX10-NEXT: v_mov_b32_e32 v25, 5 +; GFX10-NEXT: v_mov_b32_e32 v26, 5 +; GFX10-NEXT: v_mov_b32_e32 v27, 5 +; GFX10-NEXT: v_mov_b32_e32 v28, 5 +; GFX10-NEXT: v_mov_b32_e32 v29, 5 +; GFX10-NEXT: v_mov_b32_e32 v30, 6 +; GFX10-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] 
+entry: + call amdgpu_gfx void @external_void_func_8xv5i32( + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>, + <5 x i32>) + ret void +} + +define amdgpu_gfx void @stack_8xv5f32() #0 { +; GFX9-LABEL: stack_8xv5f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41100000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41200000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v8, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v9, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v10, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v11, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v12, 2.0 
+; GFX9-NEXT: v_mov_b32_e32 v13, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v14, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v19, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v20, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v21, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v22, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v23, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v24, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: stack_8xv5f32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 +; GFX10-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX10-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v9, 1.0 +; GFX10-NEXT: v_mov_b32_e32 v10, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v11, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v12, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v13, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v14, 2.0 +; GFX10-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v16, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v17, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v19, 0x40400000 +; GFX10-NEXT: v_mov_b32_e32 v20, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v21, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v22, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v23, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v24, 4.0 +; GFX10-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 
+; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] +entry: + call amdgpu_gfx void @external_void_func_8xv5f32( + <5 x float>, + <5 x float>, + <5 x float>, + <5 x float>, + <5 x float>, + <5 x float>, + <5 x float>, + <5 x float>) + ret void +} + +declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0 +declare hidden amdgpu_gfx void @stack_passed_f64_arg(<32 x i32>, double) #0 +declare hidden amdgpu_gfx void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, + <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>, + <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0 +declare hidden amdgpu_gfx void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>, + <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0 +declare hidden amdgpu_gfx void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>, + <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0 +attributes #0 = { nounwind } +attributes #1 = { nounwind noinline } diff --git 
a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -0,0 +1,835 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +declare hidden amdgpu_gfx void @external_void_func_void() #0 + +define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 3 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 3 +; GFX10-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "", ""() #0 + call amdgpu_gfx void @external_void_func_void() + ret void +} + +define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 { +; GFX9-LABEL: 
void_func_void_clobber_s30_s31: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: void_func_void_clobber_s30_s31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; clobber +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { +; GFX9-LABEL: test_call_void_func_void_mayclobber_s31: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s31 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s34, s31 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_mov_b32 s31, s34 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s31 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_mayclobber_s31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def s31 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_mov_b32 s34, s31 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_mov_b32 s31, s34 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use s31 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %s31 = call i32 asm sideeffect "; def $0", "={s31}"() + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "{s31}"(i32 %s31) + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 
{ +; GFX9-LABEL: test_call_void_func_void_mayclobber_v31: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v41, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def v31 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v31 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_mayclobber_v31: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 
s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def v31 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_mov_b32_e32 v40, v31 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v31, v40 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v31 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %v31 = call i32 asm sideeffect "; def $0", "={v31}"() + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "{v31}"(i32 %v31) + ret void +} + + +define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 { +; GFX9-LABEL: test_call_void_func_void_preserves_s33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: v_writelane_b32 v40, s33, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, 
s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s33 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s33 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s33, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_preserves_s33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def s33 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use s33 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s33, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %s33 = call i32 asm sideeffect "; def $0", "={s33}"() + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "{s33}"(i32 %s33) + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)* %out) #0 { +; GFX9-LABEL: test_call_void_func_void_preserves_s34: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s34 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s34 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_preserves_s34: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: 
s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def s34 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use s34 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %s34 = call i32 asm sideeffect "; def $0", "={s34}"() + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "{s34}"(i32 %s34) + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 { +; GFX9-LABEL: test_call_void_func_void_preserves_v40: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v41, s33, 2 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: 
s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def v40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v41, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_preserves_v40: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v41, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def v40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: ;;#ASMSTART +; 
GFX10-NEXT: ; use v40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v41, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %v40 = call i32 asm sideeffect "; def $0", "={v40}"() + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "{v40}"(i32 %v40) + ret void +} + +define hidden void @void_func_void_clobber_s33() #1 { +; GFX9-LABEL: void_func_void_clobber_s33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_writelane_b32 v0, s33, 0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s33, v0, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: void_func_void_clobber_s33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_writelane_b32 v0, s33, 0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; clobber +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_readlane_b32 s33, v0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber", "~{s33}"() #0 + ret void +} + +define hidden void @void_func_void_clobber_s34() #1 { +; GFX9-LABEL: void_func_void_clobber_s34: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_writelane_b32 v0, s34, 0 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s34, v0, 0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: void_func_void_clobber_s34: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_writelane_b32 v0, s34, 0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; clobber +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_readlane_b32 s34, v0, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber", "~{s34}"() #0 + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { +; GFX9-LABEL: test_call_void_func_void_clobber_s33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_clobber_s33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; 
GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @void_func_void_clobber_s33() + ret void +} + +define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { +; GFX9-LABEL: test_call_void_func_void_clobber_s34: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: test_call_void_func_void_clobber_s34: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void @void_func_void_clobber_s34() + ret void +} + +define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { +; GFX9-LABEL: callee_saved_sgpr_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, 
s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: v_readlane_b32 s5, v40, 2 +; GFX9-NEXT: v_readlane_b32 s40, v40, 0 +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: callee_saved_sgpr_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def s40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use s40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_readlane_b32 s5, v40, 2 +; GFX10-NEXT: v_readlane_b32 s40, v40, 0 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + ret void +} + +define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { +; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v41, s33, 3 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s40, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s31, 2 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def v32 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v40, v32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use v40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s4, v41, 1 +; GFX9-NEXT: v_readlane_b32 s5, v41, 2 +; GFX9-NEXT: v_readlane_b32 s40, v41, 0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: v_readlane_b32 s33, v41, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v41, s33, 3 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s40, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s31, 2 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def s40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; def v32 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_mov_b32_e32 v40, v32 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use s40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; use v40 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s4, v41, 1 +; GFX10-NEXT: v_readlane_b32 s5, v41, 2 +; GFX10-NEXT: v_readlane_b32 s40, v41, 0 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_readlane_b32 s33, v41, 3 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[4:5] + %s40 = call i32 asm sideeffect "; 
def s40", "={s40}"() #0 + %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + call amdgpu_gfx void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -469,3 +469,54 @@ %b = add i32 %a, 1 ret i32 %b } + +define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v42, s34, 0 +; GCN-NEXT: v_writelane_b32 v42, s35, 1 +; GCN-NEXT: v_writelane_b32 v42, s36, 2 +; GCN-NEXT: v_writelane_b32 v42, s37, 3 +; GCN-NEXT: v_writelane_b32 v42, s30, 4 +; GCN-NEXT: v_writelane_b32 v42, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v41, v1 +; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB5_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s6, v40 +; GCN-NEXT: v_readfirstlane_b32 s7, v41 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: s_movk_i32 s4, 0x7b +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB5_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: 
s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_readlane_b32 s4, v42, 4 +; GCN-NEXT: v_readlane_b32 s5, v42, 5 +; GCN-NEXT: v_readlane_b32 s37, v42, 3 +; GCN-NEXT: v_readlane_b32 s36, v42, 2 +; GCN-NEXT: v_readlane_b32 s35, v42, 1 +; GCN-NEXT: v_readlane_b32 s34, v42, 0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void %fptr(i32 inreg 123) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll --- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll @@ -1,4 +1,5 @@ -; RUN: not llc -march=amdgcn -mtriple=amdgcn-- -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s +; RUN: not llc -march=amdgcn -mtriple=amdgcn-mesa-mesa3d -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,MESA %s +; RUN: not llc -march=amdgcn -mtriple=amdgcn--amdpal -tailcallopt < %s 2>&1 | FileCheck -check-prefixes=GCN,PAL %s ; RUN: not llc -march=r600 -mtriple=r600-- -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s declare i32 @external_function(i32) nounwind @@ -68,7 +69,7 @@ ret void } -; GCN: :0:0: in function test_call_from_shader i32 (): unsupported call from graphics shader of function defined_function +; GCN-NOT: :0:0: in function test_call_from_shader i32 (): unsupported call from graphics shader of function defined_function ; R600: in function test_call{{.*}}: unsupported call to function defined_function define amdgpu_ps i32 @test_call_from_shader() { %call = call i32 @defined_function(i32 0)