Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -329,7 +329,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); Type *Ty = F->getReturnType(); @@ -363,7 +363,7 @@ unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!getAlign(*F, 0, retAlignment)) - retAlignment = DL.getABITypeAlignment(Ty); + retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value(); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else @@ -1347,7 +1347,8 @@ const DataLayout &DL = getDataLayout(); const AttributeList &PAL = F->getAttributes(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); + Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; @@ -1404,16 +1405,23 @@ } } + auto getOptimalAlignForParam = [TLI, &DL, &PAL, F, + paramIndex](Type *Ty) -> Align { + Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL); + MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex); + return max(TypeAlign, ParamAlign); + }; + if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - const Align align = DL.getValueOrABITypeAlignment( - PAL.getParamAlignment(paramIndex), Ty); + Align OptimalAlign = getOptimalAlignForParam(Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1491,10 +1499,11 @@ if (isABI || isKernelFunc) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - Align align = - DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); + Align OptimalAlign = getOptimalAlignForParam(ETy); + // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1506,10 +1515,10 @@ // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < Align(4)) - align = Align(4); + if (!isKernelFunc && OptimalAlign < Align(4)) + OptimalAlign = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -451,6 +451,16 @@ MachineFunction &MF, unsigned Intrinsic) const override; + /// getFunctionParamOptimizedAlign - since function arguments are passed via + /// .param space, we may want to have increased alignmet in it to enhance + /// vectorization options. We can increase alignment only if the function has + /// internal or private linkage as for other linkage types callers may already + /// rely on default linkage. This function returns maximum from alignment 16 + /// and default alignment. Alignment 16 is chosen because it's maximum size + /// of vectorized ld.param & st.param. + Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, + const DataLayout &DL) const; + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1304,6 +1304,7 @@ unsigned OIdx = 0; for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + const Function *F = CB.getFunction(); Type *Ty = Args[i].Ty; if (!first) { O << ", "; @@ -1312,13 +1313,13 @@ if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - unsigned align = 0; + unsigned ParamAlign = 0; const CallInst *CallI = cast(&CB); // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, align)) - align = DL.getABITypeAlignment(Ty); + if (!getAlign(*CallI, i + 1, ParamAlign)) + ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); unsigned sz = DL.getTypeAllocSize(Ty); - O << ".param .align " << align << " .b8 "; + O << ".param .align " << ParamAlign << " .b8 "; O << "_"; O << "[" << sz << "]"; // update the index for Outs @@ -1352,9 +1353,16 @@ continue; } - Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); + Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment. This code matches logic in LowerCall when + // alignment increase is performed to increase vectorization options. + Type *ETy = Args[i].IndirectType; + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); + unsigned sz = Outs[OIdx].Flags.getByValSize(); - O << ".param .align " << align.value() << " .b8 "; + O << ".param .align " << ParamByValAlign.value() << " .b8 "; O << "_"; O << "[" << sz << "]"; } @@ -1403,12 +1411,15 @@ // Check for function alignment information if we found that the // ultimate target is a Function - if (DirectCallee) + if (DirectCallee) { if (getAlign(*DirectCallee, Idx, Alignment)) return Align(Alignment); + // If alignment information is not available, fall back to the + // default function param optimized type alignment + return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); + } - // Call is indirect or alignment information is not available, fall back to - // the ABI type alignment + // Call is indirect, fall back to the ABI type alignment return DL.getABITypeAlign(Ty); } @@ -1450,6 +1461,7 @@ unsigned OIdx = 0; // Declare the .params or .reg need to pass values // to the function + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { EVT VT = Outs[OIdx].VT; Type *Ty = Args[i].Ty; @@ -1569,18 +1581,26 @@ } // ByVal arguments + // TODO: remove code duplication when handling byval and non-byval cases. SmallVector VTs; SmallVector Offsets; - assert(Args[i].IndirectType && "byval arg must have indirect type"); - ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0); + Type *ETy = Args[i].IndirectType; + assert(ETy && "byval arg must have indirect type"); + ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0); // declare .param .align .b8 .param[]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, // so we don't need to worry about natural alignment or not. // See TargetLowering::LowerCallTo(). + Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment to enhance vectorization options. + const Function *F = CB->getCalledFunction(); + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ArgAlign = std::max(ArgAlign, AlignCandidate); // Enforce minumum alignment of 4 to work around ptxas miscompile // for sm_50+. See corresponding alignment adjustment in @@ -1594,29 +1614,69 @@ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); + + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector StoreOperands; for (unsigned j = 0, je = VTs.size(); j != je; ++j) { EVT elemtype = VTs[j]; int curOffset = Offsets[j]; - unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); + Align PartAlign = commonAlignment(ArgAlign, curOffset); + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32)); + } + auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), PartAlign); + if (elemtype.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(curOffset, dl, MVT::i32), - theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); - InFlag = Chain.getValue(1); + // Record the value to store. + StoreOperands.push_back(theVal); + + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + StoreOperands.push_back(InFlag); + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, + elemtype, MachinePointerInfo(), PartAlign, + MachineMemOperand::MOStore); + InFlag = Chain.getValue(1); + + // Cleanup. + StoreOperands.clear(); + } } + assert(StoreOperands.empty() && "Unfinished parameter store."); + if (VTs.size() > 0) + --OIdx; ++paramCount; } @@ -2617,7 +2677,8 @@ const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); + const MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); Type *RetTy = MF.getFunction().getReturnType(); bool isABI = (STI.getSmVersion() >= 20); @@ -2632,7 +2693,9 @@ assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); + VTs, Offsets, + RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) + : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -4252,6 +4315,28 @@ return false; } +/// getFunctionParamOptimizedAlign - since function arguments are passed via +/// .param space, we may want to have increased alignmet in it to enhance +/// vectorization options. We can increase alignment only if the function has +/// internal or private linkage as for other linkage types callers may already +/// rely on default linkage. This function returns maximum from alignment 16 +/// and default alignment. Alignment 16 is chosen because it's maximum size +/// of vectorized ld.param & st.param. +Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( + const Function *F, Type *ArgTy, const DataLayout &DL) const { + const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); + + // If a function has linkage different from internal or private, we + // must use default ABI alignment as external users rely on it. + switch (F->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + return Align(std::max(uint64_t(16), ABITypeAlign)); + default: + return Align(ABITypeAlign); + } +} + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// Used to guide target specific optimizations, like loop strength reduction Index: llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -88,16 +88,17 @@ // cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" -#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" +#include #define DEBUG_TYPE "nvptx-lower-args" @@ -226,6 +227,89 @@ [](Instruction *I) { I->eraseFromParent(); }); } +// Try to increase alignment in param address space. We can force a +// specific alignment for byval params to make vectorization possible, +// e.g. ld.param.v2* and ld.param.v4* instead of ld.param. +// Such changes must be conformed with parameter store and load in +// NVPTXTargetLowering::LowerCall. +static void tryIncreaseByValAlign(Argument *Arg, Value *ArgInParamAS, + const NVPTXTargetLowering *TLI) { + Function *Func = Arg->getParent(); + Type *StructType = Arg->getParamByValType(); + const DataLayout DL(Func->getParent()); + + uint64_t NewArgAlign = + TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value(); + uint64_t CurArgAlign = + Arg->getAttribute(Attribute::Alignment).getValueAsInt(); + + if (CurArgAlign >= NewArgAlign) + return; + + LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of " + << CurArgAlign << " for " << *Arg << '\n'); + + auto NewAlignAttr = + Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign); + Arg->removeAttr(Attribute::Alignment); + Arg->addAttr(NewAlignAttr); + + struct Load { + LoadInst *Inst; + uint64_t Offset; + }; + + struct LoadContext { + Value *InitialVal; + uint64_t Offset; + }; + + SmallVector Loads; + std::queue Worklist; + Worklist.push({ArgInParamAS, 0}); + + while (!Worklist.empty()) { + LoadContext Ctx = Worklist.front(); + Worklist.pop(); + + for (User *CurUser : Ctx.InitialVal->users()) { + if (auto *I = dyn_cast(CurUser)) { + Loads.push_back({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + Worklist.push({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + APInt OffsetAccumulated = + APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM)); + + if (!I->accumulateConstantOffset(DL, OffsetAccumulated)) + continue; + + uint64_t OffsetLimit = -1; + uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit); + assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX"); + + Worklist.push({I, Ctx.Offset + Offset}); + continue; + } + + llvm_unreachable("All users must be one of: load, " + "bitcast, getelementptr."); + } + } + + for (Load &CurLoad : Loads) { + Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset)); + Align CurLoadAlign(CurLoad.Inst->getAlign()); + CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign)); + } +} + void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); Instruction *FirstInst = &(Func->getEntryBlock().front()); @@ -270,6 +354,16 @@ convertToParamAS(V, ArgInParamAS); }); LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n"); + + // Further optimizations require target lowering info. + if (!TM) + return; + + const auto *TLI = + cast(TM->getSubtargetImpl()->getTargetLowering()); + + tryIncreaseByValAlign(Arg, ArgInParamAS, TLI); + return; } Index: llvm/test/CodeGen/NVPTX/param-vectorize.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/param-vectorize.ll @@ -0,0 +1,838 @@ +; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; +; This is IR generated with clang using -O3 optimization level +; and nvptx-unknown-unknown target from the following C code. +; +; struct St4x1 { uint32_t field[1]; }; +; struct St4x2 { uint32_t field[2]; }; +; struct St4x3 { uint32_t field[3]; }; +; struct St4x4 { uint32_t field[4]; }; +; struct St4x5 { uint32_t field[5]; }; +; struct St4x6 { uint32_t field[6]; }; +; struct St4x7 { uint32_t field[7]; }; +; struct St4x8 { uint32_t field[8]; }; +; struct St8x1 { uint64_t field[1]; }; +; struct St8x2 { uint64_t field[2]; }; +; struct St8x3 { uint64_t field[3]; }; +; struct St8x4 { uint64_t field[4]; }; +; +; #define DECLARE_CALLEE(StName) \ +; __attribute__((noinline)) \ +; struct StName callee_##StName(struct StName in) { \ +; struct StName ret; \ +; const unsigned size = sizeof(ret.field) / sizeof(*ret.field); \ +; for (unsigned i = 0; i != size; ++i) \ +; ret.field[i] = in.field[i]; \ +; return ret; \ +; } \ +; +; #define DECLARE_CALLER(StName) \ +; struct StName caller_##StName(struct StName* in) \ +; { \ +; struct StName out = callee_##StName(*in); \ +; return out; \ +; } \ +; +; #define DECLARE_CALL(StName) \ +; DECLARE_CALLEE(StName) \ +; DECLARE_CALLER(StName) \ +; +; DECLARE_CALL(St4x1) +; DECLARE_CALL(St4x2) +; DECLARE_CALL(St4x3) +; DECLARE_CALL(St4x4) +; DECLARE_CALL(St4x5) +; DECLARE_CALL(St4x6) +; DECLARE_CALL(St4x7) +; DECLARE_CALL(St4x8) +; DECLARE_CALL(St8x1) +; DECLARE_CALL(St8x2) +; DECLARE_CALL(St8x3) +; DECLARE_CALL(St8x4) + +%struct.St4x1 = type { [1 x i32] } +%struct.St4x2 = type { [2 x i32] } +%struct.St4x3 = type { [3 x i32] } +%struct.St4x4 = type { [4 x i32] } +%struct.St4x5 = type { [5 x i32] } +%struct.St4x6 = type { [6 x i32] } +%struct.St4x7 = type { [7 x i32] } +%struct.St4x8 = type { [8 x i32] } +%struct.St8x1 = type { [1 x i64] } +%struct.St8x2 = type { [2 x i64] } +%struct.St8x3 = type { [3 x i64] } +%struct.St8x4 = type { [4 x i64] } + +; Section 1 - checking that: +; - function argument (including retval) vectorization is done with internal linkage; +; - caller and callee specify correct alignment for callee's params; +; - caller with default (external) linkage has st.param unvectorized and its retval has default alignment. + +define internal %struct.St4x1 @callee_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[4]) + ; CHECK-LABEL: callee_St4x1( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x1_param_0[4] + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; + ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x1 poison, i32 %1, 0, 0 + ret %struct.St4x1 %.fca.0.0.insert +} + +define %struct.St4x1 @caller_St4x1(%struct.St4x1* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) + ; CHECK-LABEL: caller_St4x1( + ; CHECK-NEXT: .param .b32 caller_St4x1_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x1_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]]; + ; CHECK: .param .align 16 .b8 param0[4]; + ; CHECK: st.param.b32 [param0+0], [[R2]]; + ; CHECK: .param .align 16 .b8 retval0[4]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x1, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [retval0+0]; + ; CHECK: st.param.b32 [func_retval0+0], [[R3]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x1 @callee_St4x1(%struct.St4x1* noundef byval(%struct.St4x1) align 4 %in) + %1 = extractvalue %struct.St4x1 %call, 0 + %.fca.0.extract = extractvalue [1 x i32] %1, 0 + %.fca.0.0.insert = insertvalue %struct.St4x1 poison, i32 %.fca.0.extract, 0, 0 + ret %struct.St4x1 %.fca.0.0.insert +} + +define internal %struct.St4x2 @callee_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) + ; CHECK-LABEL: callee_St4x2( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8] + ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; + ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x2 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x2 %.fca.0.0.insert, i32 %2, 0, 1 + ret %struct.St4x2 %.fca.0.1.insert +} + +define %struct.St4x2 @caller_St4x2(%struct.St4x2* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[8]) + ; CHECK-LABEL: caller_St4x2( + ; CHECK-NEXT: .param .b32 caller_St4x2_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x2_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]]; + ; CHECK: .param .align 16 .b8 param0[8]; + ; CHECK: st.param.v2.b32 [param0+0], {[[R3]], [[R2]]}; + ; CHECK: .param .align 16 .b8 retval0[8]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x2, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b32 {[[R4:%r[0-9]+]], [[R5:%r[0-9]+]]}, [retval0+0]; + ; CHECK: st.param.b32 [func_retval0+0], [[R4]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R5]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x2 @callee_St4x2(%struct.St4x2* noundef byval(%struct.St4x2) align 4 %in) + %1 = extractvalue %struct.St4x2 %call, 0 + %.fca.0.extract = extractvalue [2 x i32] %1, 0 + %.fca.1.extract = extractvalue [2 x i32] %1, 1 + %.fca.0.0.insert = insertvalue %struct.St4x2 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x2 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + ret %struct.St4x2 %.fca.0.1.insert +} + +define internal %struct.St4x3 @callee_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) + ; CHECK-LABEL: callee_St4x3( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] + ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x3 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x3 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x3 %.fca.0.1.insert, i32 %3, 0, 2 + ret %struct.St4x3 %.fca.0.2.insert +} + +define %struct.St4x3 @caller_St4x3(%struct.St4x3* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[12]) + ; CHECK-LABEL: caller_St4x3( + ; CHECK-NEXT: .param .b32 caller_St4x3_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x3_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: .param .align 16 .b8 param0[12]; + ; CHECK: st.param.v2.b32 [param0+0], {[[R3]], [[R2]]}; + ; CHECK: st.param.b32 [param0+8], [[R4]]; + ; CHECK: .param .align 16 .b8 retval0[12]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x3, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [retval0+8]; + ; CHECK: st.param.b32 [func_retval0+0], [[R5]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R6]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R7]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x3 @callee_St4x3(%struct.St4x3* noundef byval(%struct.St4x3) align 4 %in) + %1 = extractvalue %struct.St4x3 %call, 0 + %.fca.0.extract = extractvalue [3 x i32] %1, 0 + %.fca.1.extract = extractvalue [3 x i32] %1, 1 + %.fca.2.extract = extractvalue [3 x i32] %1, 2 + %.fca.0.0.insert = insertvalue %struct.St4x3 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x3 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x3 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + ret %struct.St4x3 %.fca.0.2.insert +} + +define internal %struct.St4x4 @callee_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_param_0[16] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x4 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x4 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x4 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x4 %.fca.0.2.insert, i32 %4, 0, 3 + ret %struct.St4x4 %.fca.0.3.insert +} + +define %struct.St4x4 @caller_St4x4(%struct.St4x4* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[16]) + ; CHECK-LABEL: caller_St4x4( + ; CHECK-NEXT: .param .b32 caller_St4x4_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x4_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+12]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R5:%r[0-9]+]], [[[R1]]]; + ; CHECK: .param .align 16 .b8 param0[16]; + ; CHECK: st.param.v4.b32 [param0+0], {[[R5]], [[R4]], [[R3]], [[R2]]}; + ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x4, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {[[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]], [[R9:%r[0-9]+]]}, [retval0+0]; + ; CHECK: st.param.b32 [func_retval0+0], [[R6]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R7]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R8]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R9]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x4 @callee_St4x4(%struct.St4x4* noundef byval(%struct.St4x4) align 4 %in) + %1 = extractvalue %struct.St4x4 %call, 0 + %.fca.0.extract = extractvalue [4 x i32] %1, 0 + %.fca.1.extract = extractvalue [4 x i32] %1, 1 + %.fca.2.extract = extractvalue [4 x i32] %1, 2 + %.fca.3.extract = extractvalue [4 x i32] %1, 3 + %.fca.0.0.insert = insertvalue %struct.St4x4 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x4 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x4 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x4 %.fca.0.2.insert, i32 %.fca.3.extract, 0, 3 + ret %struct.St4x4 %.fca.0.3.insert +} + +define internal %struct.St4x5 @callee_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[20]) + ; CHECK-LABEL: callee_St4x5( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.b32 [func_retval0+16], [[R5]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i32 0, i32 0, i32 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x5 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x5 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x5 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x5 %.fca.0.2.insert, i32 %4, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x5 %.fca.0.3.insert, i32 %5, 0, 4 + ret %struct.St4x5 %.fca.0.4.insert +} + +define %struct.St4x5 @caller_St4x5(%struct.St4x5* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[20]) + ; CHECK-LABEL: caller_St4x5( + ; CHECK-NEXT: .param .b32 caller_St4x5_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x5_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+12]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R5:%r[0-9]+]], [[[R1]]]; + ; CHECK: ld.u32 [[R6:%r[0-9]+]], [[[R1]]+16]; + ; CHECK: .param .align 16 .b8 param0[20]; + ; CHECK: st.param.v4.b32 [param0+0], {[[R5]], [[R4]], [[R3]], [[R2]]}; + ; CHECK: st.param.b32 [param0+16], [[R6]]; + ; CHECK: .param .align 16 .b8 retval0[20]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x5, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {[[R7:%r[0-9]+]], [[R8:%r[0-9]+]], [[R9:%r[0-9]+]], [[R10:%r[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.b32 [[R11:%r[0-9]+]], [retval0+16]; + ; CHECK: st.param.b32 [func_retval0+0], [[R7]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R8]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R9]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R10]]; + ; CHECK: st.param.b32 [func_retval0+16], [[R11]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x5 @callee_St4x5(%struct.St4x5* noundef byval(%struct.St4x5) align 4 %in) + %1 = extractvalue %struct.St4x5 %call, 0 + %.fca.0.extract = extractvalue [5 x i32] %1, 0 + %.fca.1.extract = extractvalue [5 x i32] %1, 1 + %.fca.2.extract = extractvalue [5 x i32] %1, 2 + %.fca.3.extract = extractvalue [5 x i32] %1, 3 + %.fca.4.extract = extractvalue [5 x i32] %1, 4 + %.fca.0.0.insert = insertvalue %struct.St4x5 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x5 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x5 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x5 %.fca.0.2.insert, i32 %.fca.3.extract, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x5 %.fca.0.3.insert, i32 %.fca.4.extract, 0, 4 + ret %struct.St4x5 %.fca.0.4.insert +} + +define internal %struct.St4x6 @callee_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) + ; CHECK-LABEL: callee_St4x6( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; + ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i32 0, i32 0, i32 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x6 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x6 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x6 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x6 %.fca.0.2.insert, i32 %4, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x6 %.fca.0.3.insert, i32 %5, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x6 %.fca.0.4.insert, i32 %6, 0, 5 + ret %struct.St4x6 %.fca.0.5.insert +} + +define %struct.St4x6 @caller_St4x6(%struct.St4x6* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[24]) + ; CHECK-LABEL: caller_St4x6( + ; CHECK-NEXT: .param .b32 caller_St4x6_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x6_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+12]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R5:%r[0-9]+]], [[[R1]]]; + ; CHECK: ld.u32 [[R6:%r[0-9]+]], [[[R1]]+20]; + ; CHECK: ld.u32 [[R7:%r[0-9]+]], [[[R1]]+16]; + ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: st.param.v4.b32 [param0+0], {[[R5]], [[R4]], [[R3]], [[R2]]}; + ; CHECK: st.param.v2.b32 [param0+16], {[[R7]], [[R6]]}; + ; CHECK: .param .align 16 .b8 retval0[24]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x6, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {[[R8:%r[0-9]+]], [[R9:%r[0-9]+]], [[R10:%r[0-9]+]], [[R11:%r[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {[[R12:%r[0-9]+]], [[R13:%r[0-9]+]]}, [retval0+16]; + ; CHECK: st.param.b32 [func_retval0+0], [[R8]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R9]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R10]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R11]]; + ; CHECK: st.param.b32 [func_retval0+16], [[R12]]; + ; CHECK: st.param.b32 [func_retval0+20], [[R13]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x6 @callee_St4x6(%struct.St4x6* noundef byval(%struct.St4x6) align 4 %in) + %1 = extractvalue %struct.St4x6 %call, 0 + %.fca.0.extract = extractvalue [6 x i32] %1, 0 + %.fca.1.extract = extractvalue [6 x i32] %1, 1 + %.fca.2.extract = extractvalue [6 x i32] %1, 2 + %.fca.3.extract = extractvalue [6 x i32] %1, 3 + %.fca.4.extract = extractvalue [6 x i32] %1, 4 + %.fca.5.extract = extractvalue [6 x i32] %1, 5 + %.fca.0.0.insert = insertvalue %struct.St4x6 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x6 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x6 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x6 %.fca.0.2.insert, i32 %.fca.3.extract, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x6 %.fca.0.3.insert, i32 %.fca.4.extract, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x6 %.fca.0.4.insert, i32 %.fca.5.extract, 0, 5 + ret %struct.St4x6 %.fca.0.5.insert +} + +define internal %struct.St4x7 @callee_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[28]) + ; CHECK-LABEL: callee_St4x7( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; + ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; + ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; + ; CHECK: st.param.b32 [func_retval0+24], [[R7]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i32 0, i32 0, i32 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x7 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x7 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x7 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x7 %.fca.0.2.insert, i32 %4, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x7 %.fca.0.3.insert, i32 %5, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x7 %.fca.0.4.insert, i32 %6, 0, 5 + %.fca.0.6.insert = insertvalue %struct.St4x7 %.fca.0.5.insert, i32 %7, 0, 6 + ret %struct.St4x7 %.fca.0.6.insert +} + +define %struct.St4x7 @caller_St4x7(%struct.St4x7* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[28]) + ; CHECK-LABEL: caller_St4x7( + ; CHECK-NEXT: .param .b32 caller_St4x7_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x7_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+12]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R5:%r[0-9]+]], [[[R1]]]; + ; CHECK: ld.u32 [[R6:%r[0-9]+]], [[[R1]]+20]; + ; CHECK: ld.u32 [[R7:%r[0-9]+]], [[[R1]]+16]; + ; CHECK: ld.u32 [[R8:%r[0-9]+]], [[[R1]]+24]; + ; CHECK: .param .align 16 .b8 param0[28]; + ; CHECK: st.param.v4.b32 [param0+0], {[[R5]], [[R4]], [[R3]], [[R2]]}; + ; CHECK: st.param.v2.b32 [param0+16], {[[R7]], [[R6]]}; + ; CHECK: st.param.b32 [param0+24], [[R8]]; + ; CHECK: .param .align 16 .b8 retval0[28]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x7, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {[[R9:%r[0-9]+]], [[R10:%r[0-9]+]], [[R11:%r[0-9]+]], [[R12:%r[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {[[R13:%r[0-9]+]], [[R14:%r[0-9]+]]}, [retval0+16]; + ; CHECK: ld.param.b32 [[R15:%r[0-9]+]], [retval0+24]; + ; CHECK: st.param.b32 [func_retval0+0], [[R9]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R10]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R11]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R12]]; + ; CHECK: st.param.b32 [func_retval0+16], [[R13]]; + ; CHECK: st.param.b32 [func_retval0+20], [[R14]]; + ; CHECK: st.param.b32 [func_retval0+24], [[R15]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x7 @callee_St4x7(%struct.St4x7* noundef byval(%struct.St4x7) align 4 %in) + %1 = extractvalue %struct.St4x7 %call, 0 + %.fca.0.extract = extractvalue [7 x i32] %1, 0 + %.fca.1.extract = extractvalue [7 x i32] %1, 1 + %.fca.2.extract = extractvalue [7 x i32] %1, 2 + %.fca.3.extract = extractvalue [7 x i32] %1, 3 + %.fca.4.extract = extractvalue [7 x i32] %1, 4 + %.fca.5.extract = extractvalue [7 x i32] %1, 5 + %.fca.6.extract = extractvalue [7 x i32] %1, 6 + %.fca.0.0.insert = insertvalue %struct.St4x7 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x7 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x7 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x7 %.fca.0.2.insert, i32 %.fca.3.extract, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x7 %.fca.0.3.insert, i32 %.fca.4.extract, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x7 %.fca.0.4.insert, i32 %.fca.5.extract, 0, 5 + %.fca.0.6.insert = insertvalue %struct.St4x7 %.fca.0.5.insert, i32 %.fca.6.extract, 0, 6 + ret %struct.St4x7 %.fca.0.6.insert +} + +define internal %struct.St4x8 @callee_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) + ; CHECK-LABEL: callee_St4x8( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; + ; CHECK: ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i32 0, i32 0, i32 7 + %8 = load i32, i32* %arrayidx.7, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x8 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x8 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x8 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x8 %.fca.0.2.insert, i32 %4, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x8 %.fca.0.3.insert, i32 %5, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x8 %.fca.0.4.insert, i32 %6, 0, 5 + %.fca.0.6.insert = insertvalue %struct.St4x8 %.fca.0.5.insert, i32 %7, 0, 6 + %.fca.0.7.insert = insertvalue %struct.St4x8 %.fca.0.6.insert, i32 %8, 0, 7 + ret %struct.St4x8 %.fca.0.7.insert +} + +define %struct.St4x8 @caller_St4x8(%struct.St4x8* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[32]) + ; CHECK-LABEL: caller_St4x8( + ; CHECK-NEXT: .param .b32 caller_St4x8_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St4x8_param_0]; + ; CHECK: ld.u32 [[R2:%r[0-9]+]], [[[R1]]+12]; + ; CHECK: ld.u32 [[R3:%r[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u32 [[R4:%r[0-9]+]], [[[R1]]+4]; + ; CHECK: ld.u32 [[R5:%r[0-9]+]], [[[R1]]]; + ; CHECK: ld.u32 [[R6:%r[0-9]+]], [[[R1]]+28]; + ; CHECK: ld.u32 [[R7:%r[0-9]+]], [[[R1]]+24]; + ; CHECK: ld.u32 [[R8:%r[0-9]+]], [[[R1]]+20]; + ; CHECK: ld.u32 [[R9:%r[0-9]+]], [[[R1]]+16]; + ; CHECK: .param .align 16 .b8 param0[32]; + ; CHECK: st.param.v4.b32 [param0+0], {[[R5]], [[R4]], [[R3]], [[R2]]}; + ; CHECK: st.param.v4.b32 [param0+16], {[[R9]], [[R8]], [[R7]], [[R6]]}; + ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x8, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {[[R10:%r[0-9]+]], [[R11:%r[0-9]+]], [[R12:%r[0-9]+]], [[R13:%r[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {[[R14:%r[0-9]+]], [[R15:%r[0-9]+]], [[R16:%r[0-9]+]], [[R17:%r[0-9]+]]}, [retval0+16]; + ; CHECK: st.param.b32 [func_retval0+0], [[R10]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R11]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R12]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R13]]; + ; CHECK: st.param.b32 [func_retval0+16], [[R14]]; + ; CHECK: st.param.b32 [func_retval0+20], [[R15]]; + ; CHECK: st.param.b32 [func_retval0+24], [[R16]]; + ; CHECK: st.param.b32 [func_retval0+28], [[R17]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St4x8 @callee_St4x8(%struct.St4x8* noundef byval(%struct.St4x8) align 4 %in) + %1 = extractvalue %struct.St4x8 %call, 0 + %.fca.0.extract = extractvalue [8 x i32] %1, 0 + %.fca.1.extract = extractvalue [8 x i32] %1, 1 + %.fca.2.extract = extractvalue [8 x i32] %1, 2 + %.fca.3.extract = extractvalue [8 x i32] %1, 3 + %.fca.4.extract = extractvalue [8 x i32] %1, 4 + %.fca.5.extract = extractvalue [8 x i32] %1, 5 + %.fca.6.extract = extractvalue [8 x i32] %1, 6 + %.fca.7.extract = extractvalue [8 x i32] %1, 7 + %.fca.0.0.insert = insertvalue %struct.St4x8 poison, i32 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x8 %.fca.0.0.insert, i32 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x8 %.fca.0.1.insert, i32 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x8 %.fca.0.2.insert, i32 %.fca.3.extract, 0, 3 + %.fca.0.4.insert = insertvalue %struct.St4x8 %.fca.0.3.insert, i32 %.fca.4.extract, 0, 4 + %.fca.0.5.insert = insertvalue %struct.St4x8 %.fca.0.4.insert, i32 %.fca.5.extract, 0, 5 + %.fca.0.6.insert = insertvalue %struct.St4x8 %.fca.0.5.insert, i32 %.fca.6.extract, 0, 6 + %.fca.0.7.insert = insertvalue %struct.St4x8 %.fca.0.6.insert, i32 %.fca.7.extract, 0, 7 + ret %struct.St4x8 %.fca.0.7.insert +} + +define internal %struct.St8x1 @callee_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) + ; CHECK-LABEL: callee_St8x1( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x1_param_0[8] + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD1]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i32 0, i32 0, i32 0 + %1 = load i64, i64* %arrayidx, align 8 + %.fca.0.0.insert = insertvalue %struct.St8x1 poison, i64 %1, 0, 0 + ret %struct.St8x1 %.fca.0.0.insert +} + +define %struct.St8x1 @caller_St8x1(%struct.St8x1* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) + ; CHECK-LABEL: caller_St8x1( + ; CHECK-NEXT: .param .b32 caller_St8x1_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St8x1_param_0]; + ; CHECK: ld.u64 [[RD1:%rd[0-9]+]], [[[R1]]]; + ; CHECK: .param .align 16 .b8 param0[8]; + ; CHECK: st.param.b64 [param0+0], [[RD1]]; + ; CHECK: .param .align 16 .b8 retval0[8]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x1, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.b64 [[RD2:%rd[0-9]+]], [retval0+0]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD2]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St8x1 @callee_St8x1(%struct.St8x1* noundef byval(%struct.St8x1) align 8 %in) + %1 = extractvalue %struct.St8x1 %call, 0 + %.fca.0.extract = extractvalue [1 x i64] %1, 0 + %.fca.0.0.insert = insertvalue %struct.St8x1 poison, i64 %.fca.0.extract, 0, 0 + ret %struct.St8x1 %.fca.0.0.insert +} + +define internal %struct.St8x2 @callee_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St8x2( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i32 0, i32 0, i32 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i32 0, i32 0, i32 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %.fca.0.0.insert = insertvalue %struct.St8x2 poison, i64 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x2 %.fca.0.0.insert, i64 %2, 0, 1 + ret %struct.St8x2 %.fca.0.1.insert +} + +define %struct.St8x2 @caller_St8x2(%struct.St8x2* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 8 .b8 func_retval0[16]) + ; CHECK-LABEL: caller_St8x2( + ; CHECK-NEXT: .param .b32 caller_St8x2_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St8x2_param_0]; + ; CHECK: ld.u64 [[RD1:%rd[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u64 [[RD2:%rd[0-9]+]], [[[R1]]]; + ; CHECK: .param .align 16 .b8 param0[16]; + ; CHECK: st.param.v2.b64 [param0+0], {[[RD2]], [[RD1]]}; + ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x2, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [retval0+0]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD3]]; + ; CHECK: st.param.b64 [func_retval0+8], [[RD4]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St8x2 @callee_St8x2(%struct.St8x2* noundef byval(%struct.St8x2) align 8 %in) + %1 = extractvalue %struct.St8x2 %call, 0 + %.fca.0.extract = extractvalue [2 x i64] %1, 0 + %.fca.1.extract = extractvalue [2 x i64] %1, 1 + %.fca.0.0.insert = insertvalue %struct.St8x2 poison, i64 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x2 %.fca.0.0.insert, i64 %.fca.1.extract, 0, 1 + ret %struct.St8x2 %.fca.0.1.insert +} + +define internal %struct.St8x3 @callee_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) + ; CHECK-LABEL: callee_St8x3( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; + ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.b64 [func_retval0+16], [[RD3]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i32 0, i32 0, i32 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i32 0, i32 0, i32 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i32 0, i32 0, i32 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %.fca.0.0.insert = insertvalue %struct.St8x3 poison, i64 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x3 %.fca.0.0.insert, i64 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St8x3 %.fca.0.1.insert, i64 %3, 0, 2 + ret %struct.St8x3 %.fca.0.2.insert +} + +define %struct.St8x3 @caller_St8x3(%struct.St8x3* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 8 .b8 func_retval0[24]) + ; CHECK-LABEL: caller_St8x3( + ; CHECK-NEXT: .param .b32 caller_St8x3_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St8x3_param_0]; + ; CHECK: ld.u64 [[RD1:%rd[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u64 [[RD2:%rd[0-9]+]], [[[R1]]]; + ; CHECK: ld.u64 [[RD3:%rd[0-9]+]], [[[R1]]+16]; + ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: st.param.v2.b64 [param0+0], {[[RD2]], [[RD1]]}; + ; CHECK: st.param.b64 [param0+16], [[RD3]]; + ; CHECK: .param .align 16 .b8 retval0[24]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x3, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {[[RD4:%rd[0-9]+]], [[RD5:%rd[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.b64 [[RD6:%rd[0-9]+]], [retval0+16]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD4]]; + ; CHECK: st.param.b64 [func_retval0+8], [[RD5]]; + ; CHECK: st.param.b64 [func_retval0+16], [[RD6]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St8x3 @callee_St8x3(%struct.St8x3* noundef byval(%struct.St8x3) align 8 %in) + %1 = extractvalue %struct.St8x3 %call, 0 + %.fca.0.extract = extractvalue [3 x i64] %1, 0 + %.fca.1.extract = extractvalue [3 x i64] %1, 1 + %.fca.2.extract = extractvalue [3 x i64] %1, 2 + %.fca.0.0.insert = insertvalue %struct.St8x3 poison, i64 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x3 %.fca.0.0.insert, i64 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St8x3 %.fca.0.1.insert, i64 %.fca.2.extract, 0, 2 + ret %struct.St8x3 %.fca.0.2.insert +} + +define internal %struct.St8x4 @callee_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) + ; CHECK-LABEL: callee_St8x4( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; + ; CHECK: ld.param.v2.u64 {[[RD5:%rd[0-9]+]], [[RD6:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i32 0, i32 0, i32 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i32 0, i32 0, i32 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i32 0, i32 0, i32 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i32 0, i32 0, i32 3 + %4 = load i64, i64* %arrayidx.3, align 8 + %.fca.0.0.insert = insertvalue %struct.St8x4 poison, i64 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x4 %.fca.0.0.insert, i64 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St8x4 %.fca.0.1.insert, i64 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St8x4 %.fca.0.2.insert, i64 %4, 0, 3 + ret %struct.St8x4 %.fca.0.3.insert +} + +define %struct.St8x4 @caller_St8x4(%struct.St8x4* nocapture noundef readonly %in) { + ; CHECK: .func (.param .align 8 .b8 func_retval0[32]) + ; CHECK-LABEL: caller_St8x4( + ; CHECK-NEXT: .param .b32 caller_St8x4_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [caller_St8x4_param_0]; + ; CHECK: ld.u64 [[RD1:%rd[0-9]+]], [[[R1]]+8]; + ; CHECK: ld.u64 [[RD2:%rd[0-9]+]], [[[R1]]]; + ; CHECK: ld.u64 [[RD3:%rd[0-9]+]], [[[R1]]+24]; + ; CHECK: ld.u64 [[RD4:%rd[0-9]+]], [[[R1]]+16]; + ; CHECK: .param .align 16 .b8 param0[32]; + ; CHECK: st.param.v2.b64 [param0+0], {[[RD2]], [[RD1]]}; + ; CHECK: st.param.v2.b64 [param0+16], {[[RD4]], [[RD3]]}; + ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x4, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {[[RD5:%rd[0-9]+]], [[RD6:%rd[0-9]+]]}, [retval0+0]; + ; CHECK: ld.param.v2.b64 {[[RD7:%rd[0-9]+]], [[RD8:%rd[0-9]+]]}, [retval0+16]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD5]]; + ; CHECK: st.param.b64 [func_retval0+8], [[RD6]]; + ; CHECK: st.param.b64 [func_retval0+16], [[RD7]]; + ; CHECK: st.param.b64 [func_retval0+24], [[RD8]]; + ; CHECK-NEXT: ret; + %call = tail call %struct.St8x4 @callee_St8x4(%struct.St8x4* noundef byval(%struct.St8x4) align 8 %in) + %1 = extractvalue %struct.St8x4 %call, 0 + %.fca.0.extract = extractvalue [4 x i64] %1, 0 + %.fca.1.extract = extractvalue [4 x i64] %1, 1 + %.fca.2.extract = extractvalue [4 x i64] %1, 2 + %.fca.3.extract = extractvalue [4 x i64] %1, 3 + %.fca.0.0.insert = insertvalue %struct.St8x4 poison, i64 %.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St8x4 %.fca.0.0.insert, i64 %.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St8x4 %.fca.0.1.insert, i64 %.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St8x4 %.fca.0.2.insert, i64 %.fca.3.extract, 0, 3 + ret %struct.St8x4 %.fca.0.3.insert +} + +; Section 2 - checking that function argument (including retval) vectorization is done with private linkage; + +define private %struct.St4x4 @callee_St4x4_private(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4_private( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x4 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x4 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x4 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x4 %.fca.0.2.insert, i32 %4, 0, 3 + ret %struct.St4x4 %.fca.0.3.insert +} + +; Section 3 - checking that function argument (including retval) vectorization +; is NOT done with linkage types other than internal and private. + +define external %struct.St4x4 @callee_St4x4_external(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4_external( + ; CHECK-NEXT: .param .align 4 .b8 callee_St4x4_external_param_0[16] + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; + ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R2]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R4]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i32 0, i32 0, i32 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %.fca.0.0.insert = insertvalue %struct.St4x4 poison, i32 %1, 0, 0 + %.fca.0.1.insert = insertvalue %struct.St4x4 %.fca.0.0.insert, i32 %2, 0, 1 + %.fca.0.2.insert = insertvalue %struct.St4x4 %.fca.0.1.insert, i32 %3, 0, 2 + %.fca.0.3.insert = insertvalue %struct.St4x4 %.fca.0.2.insert, i32 %4, 0, 3 + ret %struct.St4x4 %.fca.0.3.insert +}