diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -599,7 +599,11 @@
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost();
+  /// \param InsertUses List of the insertelements that are the origin of the
+  /// vectorization tree. Used to track scalar instructions and their idx and
+  /// insertelement instruction itself to correctly calculate the cost of the
+  /// resulting vector instructions (shuffled/perfect vector building).
+  InstructionCost getTreeCost(ArrayRef<Value *> InsertUses = None);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -4223,7 +4227,7 @@
   return Cost;
 }
 
-InstructionCost BoUpSLP::getTreeCost() {
+InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> InsertUses) {
   InstructionCost Cost = 0;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
@@ -4262,8 +4266,29 @@
                       << "SLP: Current total cost = " << Cost << "\n");
   }
 
+  // Track scalar instructions and their idx and insertelement instruction
+  // itself.
+  DenseMap<Value *, std::pair<Value *, int>> ValsToInserts;
+  int VF = 0;
+  for (Value *V : InsertUses) {
+    auto *I = cast<InsertElementInst>(V);
+    int Idx = UndefMaskElem;
+    if (auto *CI = dyn_cast<ConstantInt>(I->getOperand(2))) {
+      if (auto *FTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType())) {
+        Idx = CI->getSExtValue();
+        assert((VF == 0 || VF == static_cast<int>(FTy->getNumElements())) &&
+               "Expected insertelements of the same types.");
+        VF = FTy->getNumElements();
+      }
+    }
+    ValsToInserts.try_emplace(I->getOperand(1), std::make_pair(V, Idx));
+  }
+
   SmallPtrSet<Value *, 4> ExtractCostCalculated;
   InstructionCost ExtractCost = 0;
+  bool UsedShuffleMask = false;
+  bool IsIdentity = true;
+  SmallVector<int> ShuffleMask(VF, UndefMaskElem);
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!ExtractCostCalculated.insert(EU.Scalar).second)
@@ -4275,6 +4300,19 @@
     if (EphValues.count(EU.User))
       continue;
 
+    // If found user is an insertelement, do not calculate extract cost but try
+    // to detect it as a final shuffled/identity match.
+    auto It = ValsToInserts.find(EU.Scalar);
+    if (It != ValsToInserts.end() && EU.User == It->second.first) {
+      if (It->second.second >= 0 && It->second.second < VF)
+        ShuffleMask[It->second.second] = EU.Lane;
+      IsIdentity = IsIdentity && (EU.Lane == It->second.second ||
+                                  It->second.second == UndefMaskElem ||
+                                  EU.Lane == UndefMaskElem);
+      UsedShuffleMask = true;
+      continue;
+    }
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
@@ -4295,6 +4333,18 @@
   InstructionCost SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
+  if (UsedShuffleMask && !IsIdentity) {
+    InstructionCost C = TTI->getShuffleCost(
+        TTI::SK_PermuteSingleSrc,
+        FixedVectorType::get(
+            VectorizableTree.front()->Scalars.front()->getType(), VF),
+        ShuffleMask);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                      << " for final shuffle of insertelement operands "
+                      << *VectorizableTree.front()->Scalars.front() << ".\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+    Cost += C;
+  }
 
 #ifndef NDEBUG
   SmallString<256> Str;
@@ -6606,47 +6656,9 @@
       continue;
 
     R.computeMinimumValueSizes();
-    InstructionCost Cost = R.getTreeCost();
+    InstructionCost Cost =
+        R.getTreeCost(CompensateUseCost ? InsertUses : None);
     CandidateFound = true;
-    if (CompensateUseCost) {
-      // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
-      // rather than sum of single inserts as the latter may overestimate
-      // cost. This work should imply improving cost estimation for extracts
-      // that added in for external (for vectorization tree) users,i.e. that
-      // part should also switch to same interface.
-      // For example, the following case is projected code after SLP:
-      // %4 = extractelement <4 x i64> %3, i32 0
-      // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
-      // %5 = extractelement <4 x i64> %3, i32 1
-      // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
-      // %6 = extractelement <4 x i64> %3, i32 2
-      // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
-      // %7 = extractelement <4 x i64> %3, i32 3
-      // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
-      //
-      // Extracts here added by SLP in order to feed users (the inserts) of
-      // original scalars and contribute to "ExtractCost" at cost evaluation.
-      // The inserts in turn form sequence to build an aggregate that
-      // detected by findBuildAggregate routine.
-      // SLP makes an assumption that such sequence will be optimized away
-      // later (instcombine) so it tries to compensate ExctractCost with
-      // cost of insert sequence.
-      // Current per element cost calculation approach is not quite accurate
-      // and tends to create bias toward favoring vectorization.
-      // Switching to the TTI interface might help a bit.
-      // Alternative solution could be pattern-match to detect a no-op or
-      // shuffle.
-      InstructionCost UserCost = 0;
-      for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
-        auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
-        if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
-          UserCost += TTI->getVectorInstrCost(
-              Instruction::InsertElement, IE->getType(), CI->getZExtValue());
-      }
-      LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
-                        << ".\n");
-      Cost -= UserCost;
-    }
     MinCost = std::min(MinCost, Cost);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -145,21 +145,22 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0
-; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
 ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32>
+; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
+; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
 ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -189,19 +190,20 @@
 ; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-;
CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP1_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]] +; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -145,21 +145,22 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i32 0 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]] +; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]] ; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0 ; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1 -; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2 +; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP2_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ 
-189,19 +190,20 @@ ; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP1_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]] +; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll @@ -5,15 +5,13 @@ ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32 +; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5 +; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32 +; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5 ; CHECK-NEXT: ret void ; entry: diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll @@ -5,15 +5,13 @@ ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32 +; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5 +; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32 +; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 @@ -107,25 +107,31 @@ ; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_shl_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> 
[[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> -; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> -; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( @@ -184,6 +190,14 @@ ; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_shl_v8i32_const( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], +; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], @@ -235,55 +249,100 @@ ; SSE-LABEL: @ashr_lshr_shl_v8i32( ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SSE-NEXT: [[B6:%.*]] 
= extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_lshr_shl_v8i32( +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SLM-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] +; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 0 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP6]], i32 1 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP7]], i32 2 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP8]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement 
<8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa 
-slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512 @@ -107,25 +107,31 @@ ; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_shl_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> -; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> -; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> -; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( @@ -184,6 +190,14 @@ ; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_shl_v8i32_const( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; 
SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], +; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], @@ -235,55 +249,100 @@ ; SSE-LABEL: @ashr_lshr_shl_v8i32( ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; +; SLM-LABEL: @ashr_lshr_shl_v8i32( +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], 
i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SLM-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] +; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP6]], i32 1 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP7]], i32 2 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP8]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], 
i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -163,13 +163,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[R2:%.*]] = fadd double [[A2]], [[A3]] +; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[TMP5]], i32 1 +; SSE-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( @@ -218,10 +226,14 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> 
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R07]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -344,13 +356,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = add i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = add i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = add i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = add i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = add i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = add i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = add i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = add i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 
[[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -163,13 +163,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[R2:%.*]] = fadd double [[A2]], [[A3]] +; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[TMP5]], i32 1 +; SSE-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( @@ -218,10 +226,14 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], 
<8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R07]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -344,13 +356,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = add i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = add i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = add i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = add i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = add i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = add i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = add i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = add i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = 
insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -163,13 +163,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]] +; SSE-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[TMP5]], i32 1 +; SSE-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( @@ -218,10 +226,14 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: 
[[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R07]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -344,13 +356,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = sub i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = sub i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = sub i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = sub i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = sub i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = sub i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = sub i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = sub i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; 
SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -163,13 +163,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> ; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]] +; SSE-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; SSE-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; SSE-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[TMP5]], i32 1 +; SSE-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( @@ -218,10 +226,14 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: 
[[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R07]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> @@ -344,13 +356,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = sub i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = sub i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = sub i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = sub i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = sub i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = sub i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = sub i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = sub i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> 
[[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -12,70 +12,56 @@ ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 +; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = 
insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], +; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 +; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 +; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 +; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], +; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> 
[[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SHR_I_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SHR_1_I_I]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SHR_2_I_I]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[SHR_3_I_I]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[SHR_4_I_I]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SHR_5_I_I]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_6_I_I]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_7_I_I]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_8_I_I]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_9_I_I]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[SHR_10_I_I]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_11_I_I]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[SHR_12_I_I]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[SHR_13_I_I]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = trunc <16 x i32> [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i8> [[TMP17]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP18]], <16 x i8>* [[TMP19]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff 
--git a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll @@ -1,25 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 ; ; vXi8 ; define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { -; SSE-LABEL: @loadext_2i8_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i8_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i8_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -47,35 +58,35 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr 
inbounds i8, i8* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 +; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 +; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i32> [[V3]] ; ; SLM-LABEL: @loadext_4i8_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( @@ -114,40 +125,93 @@ } define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) { -; 
SSE-LABEL: @loadext_4i8_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i8_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i8_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: 
[[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; 
AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -168,68 +232,34 @@ } define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; 
SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -295,68 +325,34 @@ } define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 
-; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; 
SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -422,124 +418,58 @@ } define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) { -; SSE2-LABEL: @loadext_16i8_to_16i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x 
i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] -; -; SLM-LABEL: @loadext_16i8_to_16i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* 
[[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = sext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SSE-LABEL: @loadext_16i8_to_16i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[P8:%.*]] = 
getelementptr inbounds i8, i8* [[P0]], i64 8 +; SSE-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 +; SSE-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 +; SSE-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 +; SSE-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 +; SSE-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 +; SSE-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 +; SSE-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 +; SSE-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 +; SSE-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 +; SSE-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 +; SSE-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 +; SSE-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 +; SSE-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 +; SSE-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 +; SSE-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 +; SSE-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -665,15 +595,26 @@ ; define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { -; SSE-LABEL: @loadext_2i16_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i16, 
i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i16_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i16_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -697,40 +638,22 @@ } define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) { -; SSE2-LABEL: @loadext_4i16_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i16_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> 
poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i16_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -768,40 +691,93 @@ } define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) { -; SSE-LABEL: @loadext_4i16_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i16_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x 
i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i16_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = 
getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -822,68 +798,34 @@ } define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) { -; SSE2-LABEL: @loadext_8i16_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; 
SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i16_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i16_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 
x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -953,15 +895,26 @@ ; define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { -; SSE-LABEL: @loadext_2i32_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i32_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i32_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -985,40 +938,93 @@ } define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { -; SSE-LABEL: @loadext_4i32_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: 
[[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i32_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i32_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i32_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 
[[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i32_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i32_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i32_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], 
i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -1,25 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 ; ; vXi8 ; define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { -; SSE-LABEL: @loadext_2i8_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i8_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i8_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] 
; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -47,35 +58,35 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 +; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 +; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i32> [[V3]] ; ; SLM-LABEL: @loadext_4i8_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x 
i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( @@ -114,40 +125,93 @@ } define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) { -; SSE-LABEL: @loadext_4i8_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i8_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i8_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = 
getelementptr inbounds i8, i8* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> 
[[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -168,68 +232,34 @@ } define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, 
i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -295,68 +325,34 @@ } define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr 
inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 
[[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -422,124 +418,58 @@ } define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) { -; SSE2-LABEL: @loadext_16i8_to_16i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; 
SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] -; -; SLM-LABEL: @loadext_16i8_to_16i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SLM-NEXT: [[P12:%.*]] = 
getelementptr inbounds i8, i8* [[P0]], i64 12 -; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = sext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SSE-LABEL: @loadext_16i8_to_16i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 
+; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 +; SSE-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 +; SSE-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 +; SSE-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 +; SSE-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 +; SSE-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 +; SSE-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 +; SSE-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 +; SSE-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 +; SSE-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 +; SSE-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 +; SSE-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 +; SSE-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 +; SSE-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 +; SSE-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 +; SSE-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 +; SSE-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 +; SSE-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 +; SSE-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 +; SSE-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 +; SSE-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 +; SSE-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* 
[[P0:%.*]], i64 1 @@ -665,15 +595,26 @@ ; define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { -; SSE-LABEL: @loadext_2i16_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i16_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i16_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -697,40 +638,22 @@ } define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) { -; SSE2-LABEL: @loadext_4i16_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i16_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; 
SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i16_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -768,40 +691,93 @@ } define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) { -; SSE-LABEL: @loadext_4i16_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i16_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = 
insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i16_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: 
[[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -822,68 +798,34 @@ } define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) { -; SSE2-LABEL: @loadext_8i16_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 
[[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i16_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i16_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 +; 
SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 +; SSE-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -953,15 +895,26 @@ ; define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { -; SSE-LABEL: @loadext_2i32_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i32_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i32_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -985,40 +938,93 @@ } define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { -; SSE-LABEL: @loadext_4i32_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, 
i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i32_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i32_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; SLM-LABEL: @loadext_4i32_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement 
<4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] +; +; AVX1-LABEL: @loadext_4i32_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i32_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i32_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], 
i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 ; ; vXi8 @@ -13,23 +13,23 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { ; SSE2-LABEL: @loadext_2i8_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i8_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = 
insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( @@ -75,18 +75,17 @@ ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( @@ -129,53 +128,89 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load 
i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i8_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; 
AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -233,30 +268,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr 
inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( @@ -360,30 +390,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 
-; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( @@ -519,54 +544,41 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], 
align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] 
= insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SLM-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 +; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 +; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 +; SLM-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 +; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 +; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 +; SLM-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 +; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 +; SLM-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 +; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 +; SLM-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 +; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 +; SLM-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 +; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 ; SLM-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( @@ -695,23 +707,23 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { ; SSE2-LABEL: @loadext_2i16_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i16_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( @@ -757,18 +769,17 @@ ; SLM-NEXT: [[P1:%.*]] = 
getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( @@ -811,53 +822,89 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 
[[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext 
i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -915,30 +962,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = 
load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( @@ -1022,12 +1064,13 @@ ; ; SLM-LABEL: @loadext_2i32_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( @@ 
-1056,53 +1099,90 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 
[[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i32_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i32_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i32_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i32_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] 
= getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX512 ; ; vXi8 @@ -13,23 +13,23 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { ; SSE2-LABEL: @loadext_2i8_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = 
insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i8_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( @@ -75,18 +75,17 @@ ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( @@ -129,53 +128,89 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, 
i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i8_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i8_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = 
insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i8_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i8_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i8_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; 
AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -233,30 +268,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( @@ 
-360,30 +390,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( @@ -519,54 +544,41 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], 
align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement 
<16 x i16> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 +; SLM-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 +; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 +; SLM-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 +; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 +; SLM-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 +; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 +; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 +; SLM-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 +; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 +; SLM-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 +; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 +; SLM-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 +; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 +; SLM-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 +; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 ; SLM-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( @@ -695,23 +707,23 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { ; SSE2-LABEL: @loadext_2i16_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i16_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* 
[[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( @@ -757,18 +769,17 @@ ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( @@ -811,53 +822,89 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; 
SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i16_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i16_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; 
AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i16_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i16_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -915,30 +962,25 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; 
SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 +; SLM-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 +; SLM-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 +; SLM-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; SLM-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( @@ -1022,12 +1064,13 @@ ; ; SLM-LABEL: @loadext_2i32_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: 
[[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( @@ -1056,53 +1099,90 @@ ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; SLM-NEXT: 
[[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; -; AVX-LABEL: @loadext_4i32_to_4i64( -; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX1-LABEL: @loadext_4i32_to_4i64( +; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: ret <4 x i64> [[V3]] +; +; AVX2-LABEL: @loadext_4i32_to_4i64( +; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 
1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX2-NEXT: ret <4 x i64> [[V3]] +; +; AVX512-LABEL: @loadext_4i32_to_4i64( +; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 +; AVX512-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2