Index: clang/test/CodeGenCUDA/device-fun-linkage.cu =================================================================== --- clang/test/CodeGenCUDA/device-fun-linkage.cu +++ clang/test/CodeGenCUDA/device-fun-linkage.cu @@ -7,13 +7,24 @@ #include "Inputs/cuda.h" -// NORDC: define internal void @_Z4funcIiEvv() -// NORDC: define{{.*}} void @_Z6kernelIiEvv() -// RDC: define weak_odr void @_Z4funcIiEvv() -// RDC: define weak_odr void @_Z6kernelIiEvv() - template __device__ void func() {} template __global__ void kernel() {} template __device__ void func(); +// NORDC: define internal void @_Z4funcIiEvv() +// RDC: define weak_odr void @_Z4funcIiEvv() + template __global__ void kernel(); +// NORDC: define void @_Z6kernelIiEvv() +// RDC: define weak_odr void @_Z6kernelIiEvv() + +// Ensure that unused static device function is eliminated +static __device__ void static_func() {} +// NORDC-NOT: define{{.*}} void @_ZL13static_funcv() +// RDC-NOT: define{{.*}} void @_ZL13static_funcv() + +// Ensure that kernel function has external or weak_odr +// linkage regardless static specifier +static __global__ void static_kernel() {} +// NORDC: define void @_ZL13static_kernelv() +// RDC: define weak_odr void @_ZL13static_kernelv() Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -329,7 +329,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { const DataLayout &DL = getDataLayout(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); Type *Ty = F->getReturnType(); @@ -363,7 +363,7 @@ unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; if (!getAlign(*F, 0, retAlignment)) - retAlignment = DL.getABITypeAlignment(Ty); + retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value(); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; } else @@ -1347,7 +1347,8 @@ const DataLayout &DL = getDataLayout(); const AttributeList &PAL = F->getAttributes(); const NVPTXSubtarget &STI = TM.getSubtarget(*F); - const TargetLowering *TLI = STI.getTargetLowering(); + const auto *TLI = cast(STI.getTargetLowering()); + Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; @@ -1404,16 +1405,23 @@ } } + auto getOptimalAlignForParam = [TLI, &DL, &PAL, F, + paramIndex](Type *Ty) -> Align { + Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL); + MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex); + return max(TypeAlign, ParamAlign); + }; + if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - const Align align = DL.getValueOrABITypeAlignment( - PAL.getParamAlignment(paramIndex), Ty); + Align OptimalAlign = getOptimalAlignForParam(Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1491,10 +1499,11 @@ if (isABI || isKernelFunc) { // Just print .param .align .b8 .param[size]; - // = PAL.getparamalignment + // = optimal alignment for the element type; always multiple of + // PAL.getParamAlignment // size = typeallocsize of element type - Align align = - DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); + Align OptimalAlign = getOptimalAlignForParam(ETy); + // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. Alas on sm_50+ ptxas generates @@ -1506,10 +1515,10 @@ // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < Align(4)) - align = Align(4); + if (!isKernelFunc && OptimalAlign < Align(4)) + OptimalAlign = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align.value() << " .b8 "; + O << "\t.param .align " << OptimalAlign.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -451,6 +451,16 @@ MachineFunction &MF, unsigned Intrinsic) const override; + /// getFunctionParamOptimizedAlign - since function arguments are passed via + /// .param space, we may want to increase their alignment in a way that + /// ensures that we can effectively vectorize their loads & stores. We can + /// increase alignment only if the function has internal or has private + /// linkage as for other linkage types callers may already rely on default + /// alignment. To allow using 128-bit vectorized loads/stores, this function + /// ensures that alignment is 16 or greater. + Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, + const DataLayout &DL) const; + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1304,6 +1304,7 @@ unsigned OIdx = 0; for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + const Function *F = CB.getFunction(); Type *Ty = Args[i].Ty; if (!first) { O << ", "; @@ -1312,13 +1313,13 @@ if (!Outs[OIdx].Flags.isByVal()) { if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { - unsigned align = 0; + unsigned ParamAlign = 0; const CallInst *CallI = cast(&CB); // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, align)) - align = DL.getABITypeAlignment(Ty); + if (!getAlign(*CallI, i + 1, ParamAlign)) + ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); unsigned sz = DL.getTypeAllocSize(Ty); - O << ".param .align " << align << " .b8 "; + O << ".param .align " << ParamAlign << " .b8 "; O << "_"; O << "[" << sz << "]"; // update the index for Outs @@ -1352,9 +1353,16 @@ continue; } - Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); + Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment. This code matches logic in LowerCall when + // alignment increase is performed to increase vectorization options. + Type *ETy = Args[i].IndirectType; + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); + unsigned sz = Outs[OIdx].Flags.getByValSize(); - O << ".param .align " << align.value() << " .b8 "; + O << ".param .align " << ParamByValAlign.value() << " .b8 "; O << "_"; O << "[" << sz << "]"; } @@ -1403,12 +1411,15 @@ // Check for function alignment information if we found that the // ultimate target is a Function - if (DirectCallee) + if (DirectCallee) { if (getAlign(*DirectCallee, Idx, Alignment)) return Align(Alignment); + // If alignment information is not available, fall back to the + // default function param optimized type alignment + return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); + } - // Call is indirect or alignment information is not available, fall back to - // the ABI type alignment + // Call is indirect, fall back to the ABI type alignment return DL.getABITypeAlign(Ty); } @@ -1569,18 +1580,26 @@ } // ByVal arguments + // TODO: remove code duplication when handling byval and non-byval cases. SmallVector VTs; SmallVector Offsets; - assert(Args[i].IndirectType && "byval arg must have indirect type"); - ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0); + Type *ETy = Args[i].IndirectType; + assert(ETy && "byval arg must have indirect type"); + ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0); // declare .param .align .b8 .param[]; unsigned sz = Outs[OIdx].Flags.getByValSize(); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, // so we don't need to worry about natural alignment or not. // See TargetLowering::LowerCallTo(). + Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); + + // Try to increase alignment to enhance vectorization options. + const Function *F = CB->getCalledFunction(); + Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); + ArgAlign = std::max(ArgAlign, AlignCandidate); // Enforce minumum alignment of 4 to work around ptxas miscompile // for sm_50+. See corresponding alignment adjustment in @@ -1594,29 +1613,67 @@ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); + + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector StoreOperands; for (unsigned j = 0, je = VTs.size(); j != je; ++j) { EVT elemtype = VTs[j]; int curOffset = Offsets[j]; - unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); + Align PartAlign = commonAlignment(ArgAlign, curOffset); + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32)); + } + auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), PartAlign); + if (elemtype.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(curOffset, dl, MVT::i32), - theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); - InFlag = Chain.getValue(1); + // Record the value to store. + StoreOperands.push_back(theVal); + + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + StoreOperands.push_back(InFlag); + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, + elemtype, MachinePointerInfo(), PartAlign, + MachineMemOperand::MOStore); + InFlag = Chain.getValue(1); + + // Cleanup. + StoreOperands.clear(); + } } + assert(StoreOperands.empty() && "Unfinished parameter store."); ++paramCount; } @@ -2617,7 +2674,8 @@ const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); + const MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); Type *RetTy = MF.getFunction().getReturnType(); bool isABI = (STI.getSmVersion() >= 20); @@ -2632,7 +2690,9 @@ assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); + VTs, Offsets, + RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) + : Align(1)); // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than // 32-bits are sign extended or zero extended, depending on whether @@ -4252,6 +4312,55 @@ return false; } +/// getFunctionParamOptimizedAlign - since function arguments are passed via +/// .param space, we may want to increase their alignment in a way that +/// ensures that we can effectively vectorize their loads & stores. We can +/// increase alignment only if the function has internal or has private +/// linkage as for other linkage types callers may already rely on default +/// alignment. To allow using 128-bit vectorized loads/stores, this function +/// ensures that alignment is 16 or greater. +Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( + const Function *F, Type *ArgTy, const DataLayout &DL) const { + const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); + + // If a function has linkage different from internal or private, we + // must use default ABI alignment as external users rely on it. + switch (F->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: { + // Check that if a function has internal or private linkage + // it is not a kernel. +#ifndef NDEBUG + const NamedMDNode *NMDN = + F->getParent()->getNamedMetadata("nvvm.annotations"); + if (NMDN) { + for (const MDNode *MDN : NMDN->operands()) { + assert(MDN->getNumOperands() == 3); + + const Metadata *MD0 = MDN->getOperand(0).get(); + const auto *MDV0 = cast(MD0)->getValue(); + const auto *MDFn = cast(MDV0); + if (MDFn != F) + continue; + + const Metadata *MD1 = MDN->getOperand(1).get(); + const MDString *MDStr = cast(MD1); + if (MDStr->getString() != "kernel") + continue; + + const Metadata *MD2 = MDN->getOperand(2).get(); + const auto *MDV2 = cast(MD2)->getValue(); + assert(!cast(MDV2)->isZero()); + } + } +#endif + return Align(std::max(uint64_t(16), ABITypeAlign)); + } + default: + return Align(ABITypeAlign); + } +} + /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// Used to guide target specific optimizations, like loop strength reduction Index: llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -88,16 +88,17 @@ // cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" -#include "MCTargetDesc/NVPTXBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" +#include #define DEBUG_TYPE "nvptx-lower-args" @@ -226,6 +227,90 @@ [](Instruction *I) { I->eraseFromParent(); }); } +// Adjust alignment of arguments passed byval in .param address space. We can +// increase alignment of such arguments in a way that ensures that we can +// effectively vectorize their loads. We should also traverse all loads from +// byval pointer and adjust their alignment, if those were using known offset. +// Such alignment changes must be conformed with parameter store and load in +// NVPTXTargetLowering::LowerCall. +static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, + const NVPTXTargetLowering *TLI) { + Function *Func = Arg->getParent(); + Type *StructType = Arg->getParamByValType(); + const DataLayout DL(Func->getParent()); + + uint64_t NewArgAlign = + TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value(); + uint64_t CurArgAlign = + Arg->getAttribute(Attribute::Alignment).getValueAsInt(); + + if (CurArgAlign >= NewArgAlign) + return; + + LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of " + << CurArgAlign << " for " << *Arg << '\n'); + + auto NewAlignAttr = + Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign); + Arg->removeAttr(Attribute::Alignment); + Arg->addAttr(NewAlignAttr); + + struct Load { + LoadInst *Inst; + uint64_t Offset; + }; + + struct LoadContext { + Value *InitialVal; + uint64_t Offset; + }; + + SmallVector Loads; + std::queue Worklist; + Worklist.push({ArgInParamAS, 0}); + + while (!Worklist.empty()) { + LoadContext Ctx = Worklist.front(); + Worklist.pop(); + + for (User *CurUser : Ctx.InitialVal->users()) { + if (auto *I = dyn_cast(CurUser)) { + Loads.push_back({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + Worklist.push({I, Ctx.Offset}); + continue; + } + + if (auto *I = dyn_cast(CurUser)) { + APInt OffsetAccumulated = + APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM)); + + if (!I->accumulateConstantOffset(DL, OffsetAccumulated)) + continue; + + uint64_t OffsetLimit = -1; + uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit); + assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX"); + + Worklist.push({I, Ctx.Offset + Offset}); + continue; + } + + llvm_unreachable("All users must be one of: load, " + "bitcast, getelementptr."); + } + } + + for (Load &CurLoad : Loads) { + Align NewLoadAlign(greatestCommonDivisor(NewArgAlign, CurLoad.Offset)); + Align CurLoadAlign(CurLoad.Inst->getAlign()); + CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign)); + } +} + void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); Instruction *FirstInst = &(Func->getEntryBlock().front()); @@ -270,6 +355,16 @@ convertToParamAS(V, ArgInParamAS); }); LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n"); + + // Further optimizations require target lowering info. + if (!TM) + return; + + const auto *TLI = + cast(TM->getSubtargetImpl()->getTargetLowering()); + + adjustByValArgAlignment(Arg, ArgInParamAS, TLI); + return; } Index: llvm/test/CodeGen/NVPTX/param-vectorize-device.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -0,0 +1,801 @@ +; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; +; Check that parameters of a __device__ function with private or internal +; linkage called from a __global__ (kernel) function get increased alignment, +; and additional vectorization is performed on loads/stores with that +; parameters. +; +; Test IR is a minimized version of IR generated with the following command +; from the source code below: +; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu +; +; ---------------------------------------------------------------------------- +; #include +; +; struct St4x1 { uint32_t field[1]; }; +; struct St4x2 { uint32_t field[2]; }; +; struct St4x3 { uint32_t field[3]; }; +; struct St4x4 { uint32_t field[4]; }; +; struct St4x5 { uint32_t field[5]; }; +; struct St4x6 { uint32_t field[6]; }; +; struct St4x7 { uint32_t field[7]; }; +; struct St4x8 { uint32_t field[8]; }; +; struct St8x1 { uint64_t field[1]; }; +; struct St8x2 { uint64_t field[2]; }; +; struct St8x3 { uint64_t field[3]; }; +; struct St8x4 { uint64_t field[4]; }; +; +; #define DECLARE_CALLEE(StName) \ +; static __device__ __attribute__((noinline)) \ +; struct StName callee_##StName(struct StName in) { \ +; struct StName ret; \ +; const unsigned size = sizeof(ret.field) / sizeof(*ret.field); \ +; for (unsigned i = 0; i != size; ++i) \ +; ret.field[i] = in.field[i]; \ +; return ret; \ +; } \ + +; #define DECLARE_CALLER(StName) \ +; __global__ \ +; void caller_##StName(struct StName in, struct StName* ret) \ +; { \ +; *ret = callee_##StName(in); \ +; } \ +; +; #define DECLARE_CALL(StName) \ +; DECLARE_CALLEE(StName) \ +; DECLARE_CALLER(StName) \ +; +; DECLARE_CALL(St4x1) +; DECLARE_CALL(St4x2) +; DECLARE_CALL(St4x3) +; DECLARE_CALL(St4x4) +; DECLARE_CALL(St4x5) +; DECLARE_CALL(St4x6) +; DECLARE_CALL(St4x7) +; DECLARE_CALL(St4x8) +; DECLARE_CALL(St8x1) +; DECLARE_CALL(St8x2) +; DECLARE_CALL(St8x3) +; DECLARE_CALL(St8x4) +; ---------------------------------------------------------------------------- + +%struct.St4x1 = type { [1 x i32] } +%struct.St4x2 = type { [2 x i32] } +%struct.St4x3 = type { [3 x i32] } +%struct.St4x4 = type { [4 x i32] } +%struct.St4x5 = type { [5 x i32] } +%struct.St4x6 = type { [6 x i32] } +%struct.St4x7 = type { [7 x i32] } +%struct.St4x8 = type { [8 x i32] } +%struct.St8x1 = type { [1 x i64] } +%struct.St8x2 = type { [2 x i64] } +%struct.St8x3 = type { [3 x i64] } +%struct.St8x4 = type { [4 x i64] } + +; Section 1 - checking that: +; - function argument (including retval) vectorization is done with internal linkage; +; - caller and callee specify correct alignment for callee's params. + +define dso_local void @caller_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x1( + ; CHECK: .param .align 4 .b8 caller_St4x1_param_0[4], + ; CHECK: .param .b32 caller_St4x1_param_1 + ; CHECK: ) + ; CHECK: .param .b32 param0; + ; CHECK: st.param.b32 [param0+0], {{%r[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[4]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x1, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+0]; + %1 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0 + %2 = load i32, i32* %1, align 4 + %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %2) + %.fca.0.extract = extractvalue [1 x i32] %call, 0 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + ret void +} + +define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[4]) + ; CHECK-LABEL: callee_St4x1( + ; CHECK-NEXT: .param .b32 callee_St4x1_param_0 + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; + ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK-NEXT: ret; + %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0 + ret [1 x i32] %oldret +} + +define dso_local void @caller_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x2( + ; CHECK: .param .align 4 .b8 caller_St4x2_param_0[8], + ; CHECK: .param .b32 caller_St4x2_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[8]; + ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[8]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x2, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + %agg.tmp = alloca i64, align 8 + %tmpcast = bitcast i64* %agg.tmp to %struct.St4x2* + %1 = bitcast %struct.St4x2* %in to i64* + %2 = load i64, i64* %1, align 4 + store i64 %2, i64* %agg.tmp, align 8 + %call = tail call fastcc [2 x i32] @callee_St4x2(%struct.St4x2* noundef nonnull byval(%struct.St4x2) align 4 %tmpcast) + %.fca.0.extract = extractvalue [2 x i32] %call, 0 + %.fca.1.extract = extractvalue [2 x i32] %call, 1 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 + ret void +} + +define internal fastcc [2 x i32] @callee_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) + ; CHECK-LABEL: callee_St4x2( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8] + ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; + ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %3 = insertvalue [2 x i32] poison, i32 %1, 0 + %oldret = insertvalue [2 x i32] %3, i32 %2, 1 + ret [2 x i32] %oldret +} + +define dso_local void @caller_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x3( + ; CHECK: .param .align 4 .b8 caller_St4x3_param_0[12], + ; CHECK: .param .b32 caller_St4x3_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[12]; + ; CHECK: st.param.v2.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[12]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x3, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8]; + %call = tail call fastcc [3 x i32] @callee_St4x3(%struct.St4x3* noundef nonnull byval(%struct.St4x3) align 4 %in) + %.fca.0.extract = extractvalue [3 x i32] %call, 0 + %.fca.1.extract = extractvalue [3 x i32] %call, 1 + %.fca.2.extract = extractvalue [3 x i32] %call, 2 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 + %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 + ret void +} + + +define internal fastcc [3 x i32] @callee_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) + ; CHECK-LABEL: callee_St4x3( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] + ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: st.param.v2.b32 [func_retval0+0], {[[R1]], [[R2]]}; + ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %4 = insertvalue [3 x i32] poison, i32 %1, 0 + %5 = insertvalue [3 x i32] %4, i32 %2, 1 + %oldret = insertvalue [3 x i32] %5, i32 %3, 2 + ret [3 x i32] %oldret +} + + +define dso_local void @caller_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x4( + ; CHECK: .param .align 4 .b8 caller_St4x4_param_0[16], + ; CHECK: .param .b32 caller_St4x4_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[16]; + ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x4, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + %call = tail call fastcc [4 x i32] @callee_St4x4(%struct.St4x4* noundef nonnull byval(%struct.St4x4) align 4 %in) + %.fca.0.extract = extractvalue [4 x i32] %call, 0 + %.fca.1.extract = extractvalue [4 x i32] %call, 1 + %.fca.2.extract = extractvalue [4 x i32] %call, 2 + %.fca.3.extract = extractvalue [4 x i32] %call, 3 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 + %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4 + %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3 + store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4 + ret void +} + + +define internal fastcc [4 x i32] @callee_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_param_0[16] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %5 = insertvalue [4 x i32] poison, i32 %1, 0 + %6 = insertvalue [4 x i32] %5, i32 %2, 1 + %7 = insertvalue [4 x i32] %6, i32 %3, 2 + %oldret = insertvalue [4 x i32] %7, i32 %4, 3 + ret [4 x i32] %oldret +} + + +define dso_local void @caller_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x5( + ; CHECK: .param .align 4 .b8 caller_St4x5_param_0[20], + ; CHECK: .param .b32 caller_St4x5_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[20]; + ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[20]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x5, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16]; + %call = tail call fastcc [5 x i32] @callee_St4x5(%struct.St4x5* noundef nonnull byval(%struct.St4x5) align 4 %in) + %.fca.0.extract = extractvalue [5 x i32] %call, 0 + %.fca.1.extract = extractvalue [5 x i32] %call, 1 + %.fca.2.extract = extractvalue [5 x i32] %call, 2 + %.fca.3.extract = extractvalue [5 x i32] %call, 3 + %.fca.4.extract = extractvalue [5 x i32] %call, 4 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx3, align 4 + %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx5, align 4 + %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3 + store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx7, align 4 + %ref.tmp.sroa.7.0..sroa_idx9 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4 + store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx9, align 4 + ret void +} + + +define internal fastcc [5 x i32] @callee_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[20]) + ; CHECK-LABEL: callee_St4x5( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.b32 [func_retval0+16], [[R5]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %6 = insertvalue [5 x i32] poison, i32 %1, 0 + %7 = insertvalue [5 x i32] %6, i32 %2, 1 + %8 = insertvalue [5 x i32] %7, i32 %3, 2 + %9 = insertvalue [5 x i32] %8, i32 %4, 3 + %oldret = insertvalue [5 x i32] %9, i32 %5, 4 + ret [5 x i32] %oldret +} + + +define dso_local void @caller_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x6( + ; CHECK: .param .align 4 .b8 caller_St4x6_param_0[24], + ; CHECK: .param .b32 caller_St4x6_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[24]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x6, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; + %call = tail call fastcc [6 x i32] @callee_St4x6(%struct.St4x6* noundef nonnull byval(%struct.St4x6) align 4 %in) + %.fca.0.extract = extractvalue [6 x i32] %call, 0 + %.fca.1.extract = extractvalue [6 x i32] %call, 1 + %.fca.2.extract = extractvalue [6 x i32] %call, 2 + %.fca.3.extract = extractvalue [6 x i32] %call, 3 + %.fca.4.extract = extractvalue [6 x i32] %call, 4 + %.fca.5.extract = extractvalue [6 x i32] %call, 5 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 + %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 + %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3 + store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 + %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4 + store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 + %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5 + store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 + ret void +} + + +define internal fastcc [6 x i32] @callee_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) + ; CHECK-LABEL: callee_St4x6( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; + ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %7 = insertvalue [6 x i32] poison, i32 %1, 0 + %8 = insertvalue [6 x i32] %7, i32 %2, 1 + %9 = insertvalue [6 x i32] %8, i32 %3, 2 + %10 = insertvalue [6 x i32] %9, i32 %4, 3 + %11 = insertvalue [6 x i32] %10, i32 %5, 4 + %oldret = insertvalue [6 x i32] %11, i32 %6, 5 + ret [6 x i32] %oldret +} + + +define dso_local void @caller_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x7( + ; CHECK: .param .align 4 .b8 caller_St4x7_param_0[28], + ; CHECK: .param .b32 caller_St4x7_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[28]; + ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[28]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x7, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; + ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+24]; + %call = tail call fastcc [7 x i32] @callee_St4x7(%struct.St4x7* noundef nonnull byval(%struct.St4x7) align 4 %in) + %.fca.0.extract = extractvalue [7 x i32] %call, 0 + %.fca.1.extract = extractvalue [7 x i32] %call, 1 + %.fca.2.extract = extractvalue [7 x i32] %call, 2 + %.fca.3.extract = extractvalue [7 x i32] %call, 3 + %.fca.4.extract = extractvalue [7 x i32] %call, 4 + %.fca.5.extract = extractvalue [7 x i32] %call, 5 + %.fca.6.extract = extractvalue [7 x i32] %call, 6 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 + %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 + %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3 + store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 + %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4 + store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 + %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5 + store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 + %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6 + store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4 + ret void +} + + +define internal fastcc [7 x i32] @callee_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[28]) + ; CHECK-LABEL: callee_St4x7( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; + ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; + ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; + ; CHECK: st.param.b32 [func_retval0+24], [[R7]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %8 = insertvalue [7 x i32] poison, i32 %1, 0 + %9 = insertvalue [7 x i32] %8, i32 %2, 1 + %10 = insertvalue [7 x i32] %9, i32 %3, 2 + %11 = insertvalue [7 x i32] %10, i32 %4, 3 + %12 = insertvalue [7 x i32] %11, i32 %5, 4 + %13 = insertvalue [7 x i32] %12, i32 %6, 5 + %oldret = insertvalue [7 x i32] %13, i32 %7, 6 + ret [7 x i32] %oldret +} + + +define dso_local void @caller_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St4x8( + ; CHECK: .param .align 4 .b8 caller_St4x8_param_0[32], + ; CHECK: .param .b32 caller_St4x8_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[32]; + ; CHECK: st.param.v4.b32 [param0+0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St4x8, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; + %call = tail call fastcc [8 x i32] @callee_St4x8(%struct.St4x8* noundef nonnull byval(%struct.St4x8) align 4 %in) + %.fca.0.extract = extractvalue [8 x i32] %call, 0 + %.fca.1.extract = extractvalue [8 x i32] %call, 1 + %.fca.2.extract = extractvalue [8 x i32] %call, 2 + %.fca.3.extract = extractvalue [8 x i32] %call, 3 + %.fca.4.extract = extractvalue [8 x i32] %call, 4 + %.fca.5.extract = extractvalue [8 x i32] %call, 5 + %.fca.6.extract = extractvalue [8 x i32] %call, 6 + %.fca.7.extract = extractvalue [8 x i32] %call, 7 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0 + store i32 %.fca.0.extract, i32* %ref.tmp.sroa.0.0..sroa_idx, align 4 + %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1 + store i32 %.fca.1.extract, i32* %ref.tmp.sroa.4.0..sroa_idx2, align 4 + %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2 + store i32 %.fca.2.extract, i32* %ref.tmp.sroa.5.0..sroa_idx4, align 4 + %ref.tmp.sroa.6.0..sroa_idx6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3 + store i32 %.fca.3.extract, i32* %ref.tmp.sroa.6.0..sroa_idx6, align 4 + %ref.tmp.sroa.7.0..sroa_idx8 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4 + store i32 %.fca.4.extract, i32* %ref.tmp.sroa.7.0..sroa_idx8, align 4 + %ref.tmp.sroa.8.0..sroa_idx10 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5 + store i32 %.fca.5.extract, i32* %ref.tmp.sroa.8.0..sroa_idx10, align 4 + %ref.tmp.sroa.9.0..sroa_idx12 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6 + store i32 %.fca.6.extract, i32* %ref.tmp.sroa.9.0..sroa_idx12, align 4 + %ref.tmp.sroa.10.0..sroa_idx14 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7 + store i32 %.fca.7.extract, i32* %ref.tmp.sroa.10.0..sroa_idx14, align 4 + ret void +} + + +define internal fastcc [8 x i32] @callee_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) + ; CHECK-LABEL: callee_St4x8( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; + ; CHECK: ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7 + %8 = load i32, i32* %arrayidx.7, align 4 + %9 = insertvalue [8 x i32] poison, i32 %1, 0 + %10 = insertvalue [8 x i32] %9, i32 %2, 1 + %11 = insertvalue [8 x i32] %10, i32 %3, 2 + %12 = insertvalue [8 x i32] %11, i32 %4, 3 + %13 = insertvalue [8 x i32] %12, i32 %5, 4 + %14 = insertvalue [8 x i32] %13, i32 %6, 5 + %15 = insertvalue [8 x i32] %14, i32 %7, 6 + %oldret = insertvalue [8 x i32] %15, i32 %8, 7 + ret [8 x i32] %oldret +} + + +define dso_local void @caller_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St8x1( + ; CHECK: .param .align 8 .b8 caller_St8x1_param_0[8], + ; CHECK: .param .b32 caller_St8x1_param_1 + ; CHECK: ) + ; CHECK: .param .b64 param0; + ; CHECK: st.param.b64 [param0+0], {{%rd[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[8]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x1, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+0]; + %1 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0 + %2 = load i64, i64* %1, align 8 + %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %2) + %.fca.0.extract = extractvalue [1 x i64] %call, 0 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0 + store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 + ret void +} + + +define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) + ; CHECK-LABEL: callee_St8x1( + ; CHECK-NEXT: .param .b64 callee_St8x1_param_0 + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; + ; CHECK: st.param.b64 [func_retval0+0], [[RD1]]; + ; CHECK-NEXT: ret; + %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0 + ret [1 x i64] %oldret +} + + +define dso_local void @caller_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St8x2( + ; CHECK: .param .align 8 .b8 caller_St8x2_param_0[16], + ; CHECK: .param .b32 caller_St8x2_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[16]; + ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x2, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + %call = tail call fastcc [2 x i64] @callee_St8x2(%struct.St8x2* noundef nonnull byval(%struct.St8x2) align 8 %in) + %.fca.0.extract = extractvalue [2 x i64] %call, 0 + %.fca.1.extract = extractvalue [2 x i64] %call, 1 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0 + store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 + %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1 + store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8 + ret void +} + + +define internal fastcc [2 x i64] @callee_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St8x2( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %3 = insertvalue [2 x i64] poison, i64 %1, 0 + %oldret = insertvalue [2 x i64] %3, i64 %2, 1 + ret [2 x i64] %oldret +} + + +define dso_local void @caller_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St8x3( + ; CHECK: .param .align 8 .b8 caller_St8x3_param_0[24], + ; CHECK: .param .b32 caller_St8x3_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}}; + ; CHECK: .param .align 16 .b8 retval0[24]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x3, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16]; + %call = tail call fastcc [3 x i64] @callee_St8x3(%struct.St8x3* noundef nonnull byval(%struct.St8x3) align 8 %in) + %.fca.0.extract = extractvalue [3 x i64] %call, 0 + %.fca.1.extract = extractvalue [3 x i64] %call, 1 + %.fca.2.extract = extractvalue [3 x i64] %call, 2 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0 + store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 + %ref.tmp.sroa.4.0..sroa_idx2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1 + store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx2, align 8 + %ref.tmp.sroa.5.0..sroa_idx4 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2 + store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx4, align 8 + ret void +} + + +define internal fastcc [3 x i64] @callee_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) + ; CHECK-LABEL: callee_St8x3( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; + ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.b64 [func_retval0+16], [[RD3]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %4 = insertvalue [3 x i64] poison, i64 %1, 0 + %5 = insertvalue [3 x i64] %4, i64 %2, 1 + %oldret = insertvalue [3 x i64] %5, i64 %3, 2 + ret [3 x i64] %oldret +} + + +define dso_local void @caller_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func caller_St8x4( + ; CHECK: .param .align 8 .b8 caller_St8x4_param_0[32], + ; CHECK: .param .b32 caller_St8x4_param_1 + ; CHECK: ) + ; CHECK: .param .align 16 .b8 param0[32]; + ; CHECK: st.param.v2.b64 [param0+0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK: call.uni (retval0), + ; CHECK-NEXT: callee_St8x4, + ; CHECK-NEXT: ( + ; CHECK-NEXT: param0 + ; CHECK-NEXT: ); + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+0]; + ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16]; + %call = tail call fastcc [4 x i64] @callee_St8x4(%struct.St8x4* noundef nonnull byval(%struct.St8x4) align 8 %in) + %.fca.0.extract = extractvalue [4 x i64] %call, 0 + %.fca.1.extract = extractvalue [4 x i64] %call, 1 + %.fca.2.extract = extractvalue [4 x i64] %call, 2 + %.fca.3.extract = extractvalue [4 x i64] %call, 3 + %ref.tmp.sroa.0.0..sroa_idx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0 + store i64 %.fca.0.extract, i64* %ref.tmp.sroa.0.0..sroa_idx, align 8 + %ref.tmp.sroa.4.0..sroa_idx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1 + store i64 %.fca.1.extract, i64* %ref.tmp.sroa.4.0..sroa_idx3, align 8 + %ref.tmp.sroa.5.0..sroa_idx5 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2 + store i64 %.fca.2.extract, i64* %ref.tmp.sroa.5.0..sroa_idx5, align 8 + %ref.tmp.sroa.6.0..sroa_idx7 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3 + store i64 %.fca.3.extract, i64* %ref.tmp.sroa.6.0..sroa_idx7, align 8 + ret void +} + + +define internal fastcc [4 x i64] @callee_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) + ; CHECK-LABEL: callee_St8x4( + ; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32] + ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; + ; CHECK: ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; + ; CHECK: st.param.v2.b64 [func_retval0+0], {[[RD1]], [[RD2]]}; + ; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3 + %4 = load i64, i64* %arrayidx.3, align 8 + %5 = insertvalue [4 x i64] poison, i64 %1, 0 + %6 = insertvalue [4 x i64] %5, i64 %2, 1 + %7 = insertvalue [4 x i64] %6, i64 %3, 2 + %oldret = insertvalue [4 x i64] %7, i64 %4, 3 + ret [4 x i64] %oldret +} + +; Section 2 - checking that function argument (including retval) vectorization is done with private linkage. + +define private fastcc [4 x i32] @callee_St4x4_private(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4_private( + ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16] + ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; + ; CHECK: st.param.v4.b32 [func_retval0+0], {[[R1]], [[R2]], [[R3]], [[R4]]}; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %5 = insertvalue [4 x i32] poison, i32 %1, 0 + %6 = insertvalue [4 x i32] %5, i32 %2, 1 + %7 = insertvalue [4 x i32] %6, i32 %3, 2 + %oldret = insertvalue [4 x i32] %7, i32 %4, 3 + ret [4 x i32] %oldret +} + +; Section 3 - checking that function argument (including retval) vectorization +; is NOT done with linkage types other than internal and private. + +define external fastcc [4 x i32] @callee_St4x4_external(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in) { + ; CHECK: .func (.param .align 4 .b8 func_retval0[16]) + ; CHECK-LABEL: callee_St4x4_external( + ; CHECK-NEXT: .param .align 4 .b8 callee_St4x4_external_param_0[16] + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; + ; CHECK: st.param.b32 [func_retval0+0], [[R1]]; + ; CHECK: st.param.b32 [func_retval0+4], [[R2]]; + ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; + ; CHECK: st.param.b32 [func_retval0+12], [[R4]]; + ; CHECK-NEXT: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %5 = insertvalue [4 x i32] poison, i32 %1, 0 + %6 = insertvalue [4 x i32] %5, i32 %2, 1 + %7 = insertvalue [4 x i32] %6, i32 %3, 2 + %oldret = insertvalue [4 x i32] %7, i32 %4, 3 + ret [4 x i32] %oldret +} Index: llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll @@ -0,0 +1,456 @@ +; RUN: llc < %s -mtriple=nvptx-unknown-unknown | FileCheck %s +; +; Check that parameters of a __global__ (kernel) function do not get increased +; alignment, and no additional vectorization is performed on loads/stores with +; that parameters. +; +; Test IR is a minimized version of IR generated with the following command +; from the source code below: +; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu +; +; ---------------------------------------------------------------------------- +; #include +; +; struct St4x1 { uint32_t field[1]; }; +; struct St4x2 { uint32_t field[2]; }; +; struct St4x3 { uint32_t field[3]; }; +; struct St4x4 { uint32_t field[4]; }; +; struct St4x5 { uint32_t field[5]; }; +; struct St4x6 { uint32_t field[6]; }; +; struct St4x7 { uint32_t field[7]; }; +; struct St4x8 { uint32_t field[8]; }; +; struct St8x1 { uint64_t field[1]; }; +; struct St8x2 { uint64_t field[2]; }; +; struct St8x3 { uint64_t field[3]; }; +; struct St8x4 { uint64_t field[4]; }; +; +; #define DECLARE_FUNCTION(StName) \ +; static __global__ __attribute__((noinline)) \ +; void foo_##StName(struct StName in, struct StName* ret) { \ +; const unsigned size = sizeof(ret->field) / sizeof(*ret->field); \ +; for (unsigned i = 0; i != size; ++i) \ +; ret->field[i] = in.field[i]; \ +; } \ +; +; DECLARE_FUNCTION(St4x1) +; DECLARE_FUNCTION(St4x2) +; DECLARE_FUNCTION(St4x3) +; DECLARE_FUNCTION(St4x4) +; DECLARE_FUNCTION(St4x5) +; DECLARE_FUNCTION(St4x6) +; DECLARE_FUNCTION(St4x7) +; DECLARE_FUNCTION(St4x8) +; DECLARE_FUNCTION(St8x1) +; DECLARE_FUNCTION(St8x2) +; DECLARE_FUNCTION(St8x3) +; DECLARE_FUNCTION(St8x4) +; ---------------------------------------------------------------------------- + +%struct.St4x1 = type { [1 x i32] } +%struct.St4x2 = type { [2 x i32] } +%struct.St4x3 = type { [3 x i32] } +%struct.St4x4 = type { [4 x i32] } +%struct.St4x5 = type { [5 x i32] } +%struct.St4x6 = type { [6 x i32] } +%struct.St4x7 = type { [7 x i32] } +%struct.St4x8 = type { [8 x i32] } +%struct.St8x1 = type { [1 x i64] } +%struct.St8x2 = type { [2 x i64] } +%struct.St8x3 = type { [3 x i64] } +%struct.St8x4 = type { [4 x i64] } + +define dso_local void @foo_St4x1(%struct.St4x1* nocapture noundef readonly byval(%struct.St4x1) align 4 %in, %struct.St4x1* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x1( + ; CHECK: .param .align 4 .b8 foo_St4x1_param_0[4], + ; CHECK: .param .b32 foo_St4x1_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x1_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x1, %struct.St4x1* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x1, %struct.St4x1* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + ret void +} + +define dso_local void @foo_St4x2(%struct.St4x2* nocapture noundef readonly byval(%struct.St4x2) align 4 %in, %struct.St4x2* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x2( + ; CHECK: .param .align 4 .b8 foo_St4x2_param_0[8], + ; CHECK: .param .b32 foo_St4x2_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x2_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x2, %struct.St4x2* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + ret void +} + +define dso_local void @foo_St4x3(%struct.St4x3* nocapture noundef readonly byval(%struct.St4x3) align 4 %in, %struct.St4x3* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x3( + ; CHECK: .param .align 4 .b8 foo_St4x3_param_0[12], + ; CHECK: .param .b32 foo_St4x3_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x3_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x3, %struct.St4x3* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + ret void +} + +define dso_local void @foo_St4x4(%struct.St4x4* nocapture noundef readonly byval(%struct.St4x4) align 4 %in, %struct.St4x4* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x4( + ; CHECK: .param .align 4 .b8 foo_St4x4_param_0[16], + ; CHECK: .param .b32 foo_St4x4_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x4_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12]; + ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx3.3 = getelementptr inbounds %struct.St4x4, %struct.St4x4* %ret, i64 0, i32 0, i64 3 + store i32 %4, i32* %arrayidx3.3, align 4 + ret void +} + +define dso_local void @foo_St4x5(%struct.St4x5* nocapture noundef readonly byval(%struct.St4x5) align 4 %in, %struct.St4x5* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x5( + ; CHECK: .param .align 4 .b8 foo_St4x5_param_0[20], + ; CHECK: .param .b32 foo_St4x5_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x5_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12]; + ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16]; + ; CHECK: st.u32 [[[R1]]+16], [[R6]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx3.3 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 3 + store i32 %4, i32* %arrayidx3.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx3.4 = getelementptr inbounds %struct.St4x5, %struct.St4x5* %ret, i64 0, i32 0, i64 4 + store i32 %5, i32* %arrayidx3.4, align 4 + ret void +} + +define dso_local void @foo_St4x6(%struct.St4x6* nocapture noundef readonly byval(%struct.St4x6) align 4 %in, %struct.St4x6* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x6( + ; CHECK: .param .align 4 .b8 foo_St4x6_param_0[24], + ; CHECK: .param .b32 foo_St4x6_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x6_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12]; + ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16]; + ; CHECK: st.u32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20]; + ; CHECK: st.u32 [[[R1]]+20], [[R7]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx3.3 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 3 + store i32 %4, i32* %arrayidx3.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx3.4 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 4 + store i32 %5, i32* %arrayidx3.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx3.5 = getelementptr inbounds %struct.St4x6, %struct.St4x6* %ret, i64 0, i32 0, i64 5 + store i32 %6, i32* %arrayidx3.5, align 4 + ret void +} + +define dso_local void @foo_St4x7(%struct.St4x7* nocapture noundef readonly byval(%struct.St4x7) align 4 %in, %struct.St4x7* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x7( + ; CHECK: .param .align 4 .b8 foo_St4x7_param_0[28], + ; CHECK: .param .b32 foo_St4x7_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x7_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12]; + ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16]; + ; CHECK: st.u32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20]; + ; CHECK: st.u32 [[[R1]]+20], [[R7]]; + ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24]; + ; CHECK: st.u32 [[[R1]]+24], [[R8]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx3.3 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 3 + store i32 %4, i32* %arrayidx3.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx3.4 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 4 + store i32 %5, i32* %arrayidx3.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx3.5 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 5 + store i32 %6, i32* %arrayidx3.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %in, i64 0, i32 0, i64 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %arrayidx3.6 = getelementptr inbounds %struct.St4x7, %struct.St4x7* %ret, i64 0, i32 0, i64 6 + store i32 %7, i32* %arrayidx3.6, align 4 + ret void +} + +define dso_local void @foo_St4x8(%struct.St4x8* nocapture noundef readonly byval(%struct.St4x8) align 4 %in, %struct.St4x8* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St4x8( + ; CHECK: .param .align 4 .b8 foo_St4x8_param_0[32], + ; CHECK: .param .b32 foo_St4x8_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St4x8_param_1]; + ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0]; + ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4]; + ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8]; + ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12]; + ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16]; + ; CHECK: st.u32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20]; + ; CHECK: st.u32 [[[R1]]+20], [[R7]]; + ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24]; + ; CHECK: st.u32 [[[R1]]+24], [[R8]]; + ; CHECK: ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28]; + ; CHECK: st.u32 [[[R1]]+28], [[R9]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 0 + store i32 %1, i32* %arrayidx3, align 4 + %arrayidx.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx3.1 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 1 + store i32 %2, i32* %arrayidx3.1, align 4 + %arrayidx.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 2 + %3 = load i32, i32* %arrayidx.2, align 4 + %arrayidx3.2 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 2 + store i32 %3, i32* %arrayidx3.2, align 4 + %arrayidx.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 3 + %4 = load i32, i32* %arrayidx.3, align 4 + %arrayidx3.3 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 3 + store i32 %4, i32* %arrayidx3.3, align 4 + %arrayidx.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 4 + %5 = load i32, i32* %arrayidx.4, align 4 + %arrayidx3.4 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 4 + store i32 %5, i32* %arrayidx3.4, align 4 + %arrayidx.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 5 + %6 = load i32, i32* %arrayidx.5, align 4 + %arrayidx3.5 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 5 + store i32 %6, i32* %arrayidx3.5, align 4 + %arrayidx.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 6 + %7 = load i32, i32* %arrayidx.6, align 4 + %arrayidx3.6 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 6 + store i32 %7, i32* %arrayidx3.6, align 4 + %arrayidx.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %in, i64 0, i32 0, i64 7 + %8 = load i32, i32* %arrayidx.7, align 4 + %arrayidx3.7 = getelementptr inbounds %struct.St4x8, %struct.St4x8* %ret, i64 0, i32 0, i64 7 + store i32 %8, i32* %arrayidx3.7, align 4 + ret void +} + +define dso_local void @foo_St8x1(%struct.St8x1* nocapture noundef readonly byval(%struct.St8x1) align 8 %in, %struct.St8x1* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St8x1( + ; CHECK: .param .align 8 .b8 foo_St8x1_param_0[8], + ; CHECK: .param .b32 foo_St8x1_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x1_param_1]; + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0]; + ; CHECK: st.u64 [[[R1]]], [[RD1]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St8x1, %struct.St8x1* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds %struct.St8x1, %struct.St8x1* %ret, i64 0, i32 0, i64 0 + store i64 %1, i64* %arrayidx3, align 8 + ret void +} + +define dso_local void @foo_St8x2(%struct.St8x2* nocapture noundef readonly byval(%struct.St8x2) align 8 %in, %struct.St8x2* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St8x2( + ; CHECK: .param .align 8 .b8 foo_St8x2_param_0[16], + ; CHECK: .param .b32 foo_St8x2_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x2_param_1]; + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0]; + ; CHECK: st.u64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8]; + ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 0 + store i64 %1, i64* %arrayidx3, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx3.1 = getelementptr inbounds %struct.St8x2, %struct.St8x2* %ret, i64 0, i32 0, i64 1 + store i64 %2, i64* %arrayidx3.1, align 8 + ret void +} + +define dso_local void @foo_St8x3(%struct.St8x3* nocapture noundef readonly byval(%struct.St8x3) align 8 %in, %struct.St8x3* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St8x3( + ; CHECK: .param .align 8 .b8 foo_St8x3_param_0[24], + ; CHECK: .param .b32 foo_St8x3_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x3_param_1]; + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0]; + ; CHECK: st.u64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8]; + ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; + ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16]; + ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 0 + store i64 %1, i64* %arrayidx3, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx3.1 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 1 + store i64 %2, i64* %arrayidx3.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %in, i64 0, i32 0, i64 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %arrayidx3.2 = getelementptr inbounds %struct.St8x3, %struct.St8x3* %ret, i64 0, i32 0, i64 2 + store i64 %3, i64* %arrayidx3.2, align 8 + ret void +} + +define dso_local void @foo_St8x4(%struct.St8x4* nocapture noundef readonly byval(%struct.St8x4) align 8 %in, %struct.St8x4* nocapture noundef writeonly %ret) { + ; CHECK-LABEL: .visible .func foo_St8x4( + ; CHECK: .param .align 8 .b8 foo_St8x4_param_0[32], + ; CHECK: .param .b32 foo_St8x4_param_1 + ; CHECK: ) + ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [foo_St8x4_param_1]; + ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0]; + ; CHECK: st.u64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8]; + ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; + ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16]; + ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; + ; CHECK: ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24]; + ; CHECK: st.u64 [[[R1]]+24], [[RD4]]; + ; CHECK: ret; + %arrayidx = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 0 + %1 = load i64, i64* %arrayidx, align 8 + %arrayidx3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 0 + store i64 %1, i64* %arrayidx3, align 8 + %arrayidx.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 1 + %2 = load i64, i64* %arrayidx.1, align 8 + %arrayidx3.1 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 1 + store i64 %2, i64* %arrayidx3.1, align 8 + %arrayidx.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 2 + %3 = load i64, i64* %arrayidx.2, align 8 + %arrayidx3.2 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 2 + store i64 %3, i64* %arrayidx3.2, align 8 + %arrayidx.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %in, i64 0, i32 0, i64 3 + %4 = load i64, i64* %arrayidx.3, align 8 + %arrayidx3.3 = getelementptr inbounds %struct.St8x4, %struct.St8x4* %ret, i64 0, i32 0, i64 3 + store i64 %4, i64* %arrayidx3.3, align 8 + ret void +}