diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -950,6 +950,12 @@
   /// Use -1 to indicate that there is no information on the index value.
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index = -1) const;

+  /// \return The expected cost of a contiguous block of either vector Insert
+  /// or Extract instructions, as specified by \p Opcode.
+  /// \p Indices are the index values for all instructions in the sequence.
+  int getVectorInstrChainCost(unsigned Opcode, Type *Val,
+                              ArrayRef<unsigned> Indices) const;
+
   /// \return The cost of Load and Store instructions.
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace,
@@ -1356,6 +1362,8 @@
                                  Type *CondTy, const Instruction *I) = 0;
   virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
                                  unsigned Index) = 0;
+  virtual int getVectorInstrChainCost(unsigned Opcode, Type *Val,
+                                      ArrayRef<unsigned> Indices) = 0;
   virtual int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                               unsigned AddressSpace, const Instruction *I) = 0;
   virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
@@ -1779,6 +1787,10 @@
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
     return Impl.getVectorInstrCost(Opcode, Val, Index);
   }
+  int getVectorInstrChainCost(unsigned Opcode, Type *Val,
+                              ArrayRef<unsigned> Indices) override {
+    return Impl.getVectorInstrChainCost(Opcode, Val, Indices);
+  }
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I) override {
     return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -470,6 +470,11 @@
     return 1;
   }

+  unsigned getVectorInstrChainCost(unsigned Opcode, Type *Val,
+                                   ArrayRef<unsigned> Indices) {
+    return Indices.size();
+  }
+
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                            unsigned AddressSpace, const Instruction *I) {
     return 1;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -652,6 +652,13 @@
   return Cost;
 }

+int TargetTransformInfo::getVectorInstrChainCost(
+    unsigned Opcode, Type *Val, ArrayRef<unsigned> Indices) const {
+  int Cost = TTIImpl->getVectorInstrChainCost(Opcode, Val, Indices);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
                                          MaybeAlign Alignment,
                                          unsigned AddressSpace,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -133,6 +133,8 @@
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getVectorInstrChainCost(unsigned Opcode, Type *Val,
+                              ArrayRef<unsigned> Indices);
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
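The default implementation charges one unit per index (`Indices.size()`), so existing behavior is preserved unless a target overrides the hook. As a rough illustration of the intended calling pattern (not part of the patch; the helper name and the `TTI` reference are hypothetical), a pass that today sums `getVectorInstrCost()` over each index can issue a single query for the whole chain:

```cpp
// Illustrative caller only: build the index list for an insertion sequence
// and query the chain cost once, instead of summing per-element costs.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

static int getBuildVectorCost(const TargetTransformInfo &TTI, Type *VecTy,
                              unsigned NumElts) {
  SmallVector<unsigned, 16> Indices;
  for (unsigned I = 0; I < NumElts; ++I)
    Indices.push_back(I);
  // Targets overriding the hook (e.g. X86 below) can charge shared work,
  // such as extracting/reinserting a 128-bit subvector, once per chain
  // rather than once per element.
  return TTI.getVectorInstrChainCost(Instruction::InsertElement, VecTy,
                                     Indices);
}
```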
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2432,7 +2432,8 @@
   return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF, I);
 }

-int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+int X86TTIImpl::getVectorInstrChainCost(
+    unsigned Opcode, Type *Val, ArrayRef<unsigned> Indices) {
   static const CostTblEntry SLMCostTbl[] = {
     { ISD::EXTRACT_VECTOR_ELT, MVT::i8,  4 },
     { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
@@ -2441,33 +2442,47 @@
   };

   assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Opcode != Instruction::ExtractElement &&
+      Opcode != Instruction::InsertElement)
+    return BaseT::getVectorInstrChainCost(Opcode, Val, Indices);
+
   Type *ScalarType = Val->getScalarType();
-  int RegisterFileMoveCost = 0;
+  int TotalCost = 0;
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

-  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
-                       Opcode == Instruction::InsertElement)) {
-    // Legalize the type.
-    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+  // This type is legalized to a scalar type.
+  if (!LT.second.isVector())
+    return 0;

-    // This type is legalized to a scalar type.
-    if (!LT.second.isVector())
-      return 0;
+  unsigned NumElts = LT.second.getVectorNumElements();
+  unsigned SubNumElts = NumElts;
+  unsigned NumSubVecs = 1;
+  // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+  // For inserts, we also need to insert the subvector back.
+  if (LT.second.getSizeInBits() > 128) {
+    assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+    NumSubVecs = LT.second.getSizeInBits() / 128;
+    SubNumElts = NumElts / NumSubVecs;
+  }
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Unexpected vector opcode");
+  MVT MScalarTy = LT.second.getScalarType();
+  auto *SLMEntry =
+      ST->isSLM() ? CostTableLookup(SLMCostTbl, ISD, MScalarTy) : nullptr;
+  SmallSet<unsigned, 4> SubVectors;
+  for (auto I : Indices) {
     // The type may be split. Normalize the index to the new type.
-    unsigned NumElts = LT.second.getVectorNumElements();
-    unsigned SubNumElts = NumElts;
-    Index = Index % NumElts;
-
-    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
-    // For inserts, we also need to insert the subvector back.
-    if (LT.second.getSizeInBits() > 128) {
-      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
-      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
-      SubNumElts = NumElts / NumSubVecs;
-      if (SubNumElts <= Index) {
-        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
-        Index %= SubNumElts;
-      }
+    unsigned Index = I % NumElts;
+
+    // For any upper 128-bit sub-vector, enter its index into the set. Even
+    // if we extract from/insert into the same sub-vector multiple times, we
+    // only need to extract it (and insert it back) once per group.
+    if (NumSubVecs > 1 && SubNumElts <= Index) {
+      SubVectors.insert(Index / SubNumElts);
+      Index %= SubNumElts;
     }

     if (Index == 0) {
@@ -2475,29 +2490,33 @@
       // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
       // true for all.
       if (ScalarType->isFloatingPointTy())
-        return RegisterFileMoveCost;
+        continue;

       // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
-      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
-        return 1 + RegisterFileMoveCost;
+      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) {
+        TotalCost += 1;
+        continue;
+      }
     }

-    int ISD = TLI->InstructionOpcodeToISD(Opcode);
-    assert(ISD && "Unexpected vector opcode");
-    MVT MScalarTy = LT.second.getScalarType();
-    if (ST->isSLM())
-      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
-        return Entry->Cost + RegisterFileMoveCost;
+    if (SLMEntry) {
+      TotalCost += SLMEntry->Cost;
+      continue;
+    }

     // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
     if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
-        (MScalarTy.isInteger() && ST->hasSSE41()))
-      return 1 + RegisterFileMoveCost;
+        (MScalarTy.isInteger() && ST->hasSSE41())) {
+      TotalCost += 1;
+      continue;
+    }

     // Assume insertps is relatively cheap on all targets.
     if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
-        Opcode == Instruction::InsertElement)
-      return 1 + RegisterFileMoveCost;
+        Opcode == Instruction::InsertElement) {
+      TotalCost += 1;
+      continue;
+    }

     // For extractions we just need to shuffle the element to index 0, which
     // should be very cheap (assume cost = 1). For insertions we need to shuffle
@@ -2510,13 +2529,30 @@
       ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
     }
     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
-    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
+    TotalCost += ShuffleCost + IntOrFpCost;
   }

+  // Now take into account the cost of sub-vector extractions and (if
+  // required) insertions.
+  TotalCost +=
+      SubVectors.size() * (Opcode == Instruction::InsertElement ? 2 : 1);
+  return TotalCost;
+}
+
+int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+                       Opcode == Instruction::InsertElement))
+    return getVectorInstrChainCost(Opcode, Val, Index);
+
+  int RegisterFileMoveCost = 0;
   // Add to the base cost if we know that the extracted element of a vector is
   // destined to be moved to and used in the integer register file.
-  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
-    RegisterFileMoveCost += 1;
+  if (Opcode == Instruction::ExtractElement &&
+      Val->getScalarType()->isPointerTy())
+    RegisterFileMoveCost = 1;

   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
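Read as a cost model, the loop above charges each lane its scalar insert/extract cost but charges the 128-bit sub-vector extract (plus re-insert, for insertions) only once per touched sub-vector, via the SubVectors set. A minimal standalone sketch of just that grouping rule (illustrative only: a flat per-lane cost of 1 stands in for the pinsr/pextr, SLM table, and shuffle costs, and the function name is hypothetical):

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Toy model of the sub-vector grouping: for an insert/extract chain on a
// legalized vector of NumElts lanes split into 128-bit halves of SubNumElts
// lanes, upper-half lanes share one extract (and re-insert, for inserts) of
// their sub-vector instead of paying a register-file move per element.
int chainCostModel(const std::vector<unsigned> &Indices, unsigned NumElts,
                   unsigned SubNumElts, bool IsInsert) {
  int TotalCost = 0;
  std::set<unsigned> SubVectors;
  for (unsigned I : Indices) {
    unsigned Index = I % NumElts;
    if (SubNumElts < NumElts && Index >= SubNumElts)
      SubVectors.insert(Index / SubNumElts); // charged once per sub-vector
    TotalCost += 1;                          // stand-in per-lane cost
  }
  // One extract (and, for inserts, one insert back) per touched sub-vector.
  TotalCost += SubVectors.size() * (IsInsert ? 2 : 1);
  return TotalCost;
}

int main() {
  std::vector<unsigned> Indices = {0, 1, 2, 3, 4, 5, 6, 7};
  // 8 lanes on a 256-bit register with 4-lane halves: 8 + one extract/insert
  // pair for the upper half = 10.
  printf("chain cost = %d\n", chainCostModel(Indices, 8, 4, true));
}
```

For the same indices, summing the old single-element interface would charge 8 + 4*2 = 16, since each of the four upper lanes pays its own register-file move pair.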
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3870,10 +3870,12 @@
 int BoUpSLP::getGatherCost(Type *Ty,
                            const DenseSet<unsigned> &ShuffledIndices) const {
-  int Cost = 0;
-  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    if (!ShuffledIndices.count(i))
-      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  SmallVector<unsigned, 4> GatherIndices;
+  for (unsigned I = 0, E = cast<VectorType>(Ty)->getNumElements(); I < E; ++I)
+    if (!ShuffledIndices.count(I))
+      GatherIndices.push_back(I);
+  int Cost = TTI->getVectorInstrChainCost(Instruction::InsertElement, Ty,
+                                          GatherIndices);
   if (!ShuffledIndices.empty())
     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   return Cost;
@@ -7020,6 +7022,35 @@
       InsertedOperand = IE->getOperand(1);
       LastInsertInst = IE->getOperand(0);
       if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+        // TODO: Use the TTI interface for a sequence of inserts rather than
+        // the sum of single inserts, as the latter may overestimate the cost.
+        // This work should imply improving cost estimation for the extracts
+        // that were added to feed external (to the vectorization tree) users.
+        // For example, in the following case all extracts are added in order
+        // to feed external users (inserts), which in turn form a sequence to
+        // build an aggregate that we do match here:
+        //  %4 = extractelement <4 x i64> %3, i32 0
+        //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+        //  %5 = extractelement <4 x i64> %3, i32 1
+        //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+        //  %6 = extractelement <4 x i64> %3, i32 2
+        //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+        //  %7 = extractelement <4 x i64> %3, i32 3
+        //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+        //
+        // The cost of this entire sequence is currently estimated as the sum
+        // of single extracts (this aggregate-build sequence is an external
+        // user of the vectorization tree) minus the cost of the aggregate
+        // build. As this whole sequence will be optimized away, we want its
+        // cost to be zero. But that is not quite achievable with the given
+        // approach (at least for X86), because inserts can be more expensive
+        // than extracts for longer vector lengths, so the difference turns
+        // out to be non-zero in such cases. Ideally we want to match this
+        // entire sequence and treat it as a no-op (i.e. not count it in the
+        // final cost at all). Currently the difference tends to be negative,
+        // adding a bias toward favoring vectorization. If we switch to using
+        // the TTI interface for insert/extract chains (assuming the same
+        // improvement is made for the external users cost), the bias tendency
+        // will remain but will be lower.
         UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
                                             IE->getType(), CI->getZExtValue());
       }
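To put numbers on that bias using the X86 model above (an illustration, assuming an AVX target where <4 x i64> legalizes to a single 256-bit register split into two 128-bit halves): the four extracts cost 1 each plus one shared extract of the upper half, 5 in total, while the four inserts cost 1 each plus an extract/re-insert pair for that half, 6 in total, so the matched sequence still contributes 5 - 6 = -1 to the final cost rather than 0. Under the old per-element interface the two upper lanes each paid their own register-file moves, giving 6 versus 8 and a bias of -2, which is why the comment says the bias tendency will remain but will be lower.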
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,84 +12,70 @@
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8
-; CHECK-NEXT:    [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1
-; CHECK-NEXT:    store i8 [[CONV_I_I1199]], i8* [[TMP0]], align 1
-; CHECK-NEXT:    [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8
-; CHECK-NEXT:    [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
-; CHECK-NEXT:    store i8 [[CONV_1_I_I]], i8* [[ARRAYIDX_I_I7_1_I_I]], align 1
-; CHECK-NEXT:    [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8
-; CHECK-NEXT:    [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
-; CHECK-NEXT:    store i8 [[CONV_2_I_I]], i8* [[ARRAYIDX_I_I7_2_I_I]], align 1
-; CHECK-NEXT:    [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8
-; CHECK-NEXT:    [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
-; CHECK-NEXT:    store i8 [[CONV_3_I_I]], i8* [[ARRAYIDX_I_I7_3_I_I]], align 1
-; CHECK-NEXT:    [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8
-; CHECK-NEXT:    [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
-; CHECK-NEXT:    store i8 [[CONV_4_I_I]], i8* [[ARRAYIDX_I_I7_4_I_I]], align 1
-; CHECK-NEXT:    [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8
-; CHECK-NEXT:    [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
-; CHECK-NEXT:    store i8 [[CONV_5_I_I]], i8* [[ARRAYIDX_I_I7_5_I_I]], align 1
-; CHECK-NEXT:    [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8
-; CHECK-NEXT:    [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
-; CHECK-NEXT:    store i8 [[CONV_6_I_I]], i8* [[ARRAYIDX_I_I7_6_I_I]], align 1
-; CHECK-NEXT:    [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8
-; CHECK-NEXT:    [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT:    store i8 [[CONV_7_I_I]], i8* [[ARRAYIDX_I_I7_7_I_I]], align 1
-; CHECK-NEXT:    [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8
-; CHECK-NEXT:    [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
-; CHECK-NEXT:    store i8 [[CONV_8_I_I]], i8* [[ARRAYIDX_I_I7_8_I_I]], align 1
-; CHECK-NEXT:    [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8
-; CHECK-NEXT:    [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
-; CHECK-NEXT:    store i8 [[CONV_9_I_I]], i8* [[ARRAYIDX_I_I7_9_I_I]], align 1
-; CHECK-NEXT:    [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8
-; CHECK-NEXT:    [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
-; CHECK-NEXT:    store i8 [[CONV_10_I_I]], i8* [[ARRAYIDX_I_I7_10_I_I]], align 1
-; CHECK-NEXT:    [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8
-; CHECK-NEXT:    [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    store i8 [[CONV_11_I_I]], i8* [[ARRAYIDX_I_I7_11_I_I]], align 1
-; CHECK-NEXT:    [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8
-; CHECK-NEXT:    [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
-; CHECK-NEXT:    store i8 [[CONV_12_I_I]], i8* [[ARRAYIDX_I_I7_12_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8
-; CHECK-NEXT:    [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
-; CHECK-NEXT:    store i8 [[CONV_13_I_I]], i8* [[ARRAYIDX_I_I7_13_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8
-; CHECK-NEXT:    [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
-; CHECK-NEXT:    store i8 [[CONV_14_I_I]], i8* [[ARRAYIDX_I_I7_14_I_I]], align 1
 ; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8
-; CHECK-NEXT:    [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
+; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
+; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    store i8 [[CONV_15_I_I]], i8* [[ARRAYIDX_I_I7_15_I_I]], align 1
+; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void