diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1576,7 +1576,7 @@ /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + enum EntryState { Vectorize, ScatterVectorize, NeedToGather, MayBeRemoved }; EntryState State; /// Does this sequence require some shuffling? @@ -1730,6 +1730,9 @@ case NeedToGather: dbgs() << "NeedToGather\n"; break; + case MayBeRemoved: + dbgs() << "MayBeRemoved\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -1799,8 +1802,9 @@ ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || + (!Bundle && EntryState == TreeEntry::MayBeRemoved) || (Bundle && EntryState != TreeEntry::NeedToGather)) && - "Need to vectorize gather entry?"); + "Bundle and EntryState mismatch"); VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; @@ -1810,7 +1814,8 @@ ReuseShuffleIndices.end()); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); - if (Last->State != TreeEntry::NeedToGather) { + if (Last->State != TreeEntry::NeedToGather && + Last->State != TreeEntry::MayBeRemoved) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); ScalarToTreeEntry[V] = Last; @@ -2486,6 +2491,8 @@ const BoUpSLP *) { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; + if (Entry->State == TreeEntry::MayBeRemoved) + return "color=blue"; return ""; } }; @@ -2544,6 +2551,13 @@ if (Entry->State == TreeEntry::NeedToGather) continue; + if (Entry->State == TreeEntry::MayBeRemoved) { + assert(UserIgnoreList.size() == 0 && + "UserIgnoreList should be empty when roots are inserts"); + UserIgnoreList = Roots; + continue; + } + // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; @@ -2568,6 +2582,7 @@ if (!UserInst) continue; + // TODO: process TreeEntry::MayBeRemoved here // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; @@ -2609,9 +2624,40 @@ // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; + if (isa(VL[0])) { + ValueList Operands; + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(1)); + + if (cast(VL[0]->getType())->getNumElements() == + VL.size()) { + bool FoundMainInsert = false; + for (auto *Insert : VL) + if (llvm::any_of(Insert->users(), [&](Value *U) { + return !llvm::any_of(VL, [&](Value *S) { return U == S; }); + })) { + if (FoundMainInsert) { // Main insert should be unique + FoundMainInsert = false; + break; + } + FoundMainInsert = true; + } + if (FoundMainInsert) { + LLVM_DEBUG(dbgs() << "SLP: Marking inserts as may be removed.\n"); + TreeEntry *TE = + newTreeEntry(VL, TreeEntry::MayBeRemoved, None, S, UserTreeIdx); + TE->setOperand(0, Operands); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + return; + } + } + buildTree_rec(Operands, Depth, UserTreeIdx); + return; + } else { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } } if (StoreInst *SI = dyn_cast(S.OpValue)) @@ -3448,6 +3494,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + if (E->State == TreeEntry::MayBeRemoved) { + InstructionCost Cost = 0; + for (auto *Scalar : E->Scalars) { + auto *IE = cast(Scalar); + if (auto *CI = dyn_cast(IE->getOperand(2))) + Cost -= TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + return Cost; + } + Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); @@ -4434,6 +4491,19 @@ return Vec; } + if (E->State == TreeEntry::MayBeRemoved) { + vectorizeTree(E->getOperand(0)); + for (auto *Scalar : E->Scalars) + if (llvm::any_of(Scalar->users(), [&](Value *U) { + return !llvm::any_of(E->Scalars, [&](Value *S) { return U == S; }); + })) { + assert(!E->VectorizedValue && "VectorizedValue already found?"); + E->VectorizedValue = Scalar; + } + assert(E->VectorizedValue && "VectorizedValue not found?"); + return E->VectorizedValue; + } + assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); @@ -5019,6 +5089,12 @@ if (Entry->State == TreeEntry::NeedToGather) continue; + if (Entry->State == TreeEntry::MayBeRemoved) { + Value *MainInsert = Entry->VectorizedValue; + Value *Vec = getTreeEntry(Entry->getOperand(0)[0])->VectorizedValue; + MainInsert->replaceAllUsesWith(Vec); + } + assert(Entry->VectorizedValue && "Can't find vectorizable value"); // For each lane: @@ -6213,7 +6289,7 @@ // determining vectorization factor for scalar instructions. for (Value *V : VL) { Type *Ty = V->getType(); - if (!isValidElementType(Ty)) { + if (!isValidElementType(Ty) && !isa(V)) { // NOTE: the following will give user internal llvm type name, which may // not be useful. R.getORE()->emit([&]() { @@ -6244,20 +6320,16 @@ bool Changed = false; bool CandidateFound = false; InstructionCost MinCost = SLPCostThreshold.getValue(); - - bool CompensateUseCost = - !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { - return V && isa(V); - }); - assert((!CompensateUseCost || InsertUses.size() == VL.size()) && - "Each scalar expected to have an associated InsertElement user."); + Type *ScalarTy = VL[0]->getType(); + if (auto *VecTy = dyn_cast(ScalarTy)) + ScalarTy = VecTy->getElementType(); unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). - auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF); + auto *VecTy = FixedVectorType::get(ScalarTy, VF); if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { @@ -6301,46 +6373,6 @@ R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); CandidateFound = true; - if (CompensateUseCost) { - // TODO: Use TTI's getScalarizationOverhead for sequence of inserts - // rather than sum of single inserts as the latter may overestimate - // cost. This work should imply improving cost estimation for extracts - // that added in for external (for vectorization tree) users,i.e. that - // part should also switch to same interface. - // For example, the following case is projected code after SLP: - // %4 = extractelement <4 x i64> %3, i32 0 - // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 - // %5 = extractelement <4 x i64> %3, i32 1 - // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 - // %6 = extractelement <4 x i64> %3, i32 2 - // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2 - // %7 = extractelement <4 x i64> %3, i32 3 - // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3 - // - // Extracts here added by SLP in order to feed users (the inserts) of - // original scalars and contribute to "ExtractCost" at cost evaluation. - // The inserts in turn form sequence to build an aggregate that - // detected by findBuildAggregate routine. - // SLP makes an assumption that such sequence will be optimized away - // later (instcombine) so it tries to compensate ExctractCost with - // cost of insert sequence. - // Current per element cost calculation approach is not quite accurate - // and tends to create bias toward favoring vectorization. - // Switching to the TTI interface might help a bit. - // Alternative solution could be pattern-match to detect a no-op or - // shuffle. - InstructionCost UserCost = 0; - for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { - auto *IE = cast(InsertUses[I + Lane]); - if (auto *CI = dyn_cast(IE->getOperand(2))) - UserCost += TTI->getVectorInstrCost( - Instruction::InsertElement, IE->getType(), CI->getZExtValue()); - } - LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost - << ".\n"); - Cost -= UserCost; - } - MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { @@ -7458,10 +7490,8 @@ isShuffle(BuildVectorOpds))) return false; - // Vectorize starting with the build vector operands ignoring the BuildVector - // instructions for the purpose of scheduling and user extraction. - return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false, - BuildVectorInsts); + LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); + return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/false); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -13,15 +13,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: @@ -64,29 +56,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @ceil_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -112,29 +88,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @fabs_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -158,29 +118,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_fabs_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -204,29 +148,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @floor_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -250,29 +178,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sqrt_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -296,15 +208,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: @@ -345,15 +249,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @expm1_4x( ; NOACCELERATE-NEXT: entry: @@ -394,15 +290,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: @@ -443,15 +331,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @log1p_4x( ; NOACCELERATE-NEXT: entry: @@ -544,15 +424,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @logb_4x( ; NOACCELERATE-NEXT: entry: @@ -593,15 +465,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: @@ -642,15 +506,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: @@ -691,15 +547,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @tan_4x( ; NOACCELERATE-NEXT: entry: @@ -740,15 +588,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @asin_4x( ; NOACCELERATE-NEXT: entry: @@ -789,15 +629,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @acos_4x( ; NOACCELERATE-NEXT: entry: @@ -838,15 +670,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @atan_4x( ; NOACCELERATE-NEXT: entry: @@ -887,15 +711,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sinh_4x( ; NOACCELERATE-NEXT: entry: @@ -936,15 +752,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @cosh_4x( ; NOACCELERATE-NEXT: entry: @@ -985,15 +793,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @tanh_4x( ; NOACCELERATE-NEXT: entry: @@ -1034,15 +834,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @asinh_4x( ; NOACCELERATE-NEXT: entry: @@ -1083,15 +875,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @acosh_4x( ; NOACCELERATE-NEXT: entry: @@ -1132,15 +916,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @atanh_4x( ; NOACCELERATE-NEXT: entry: @@ -1220,15 +996,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_cos_4x( ; NOACCELERATE-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -13,15 +13,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_sin_4x( ; NOACCELERATE-NEXT: entry: @@ -64,29 +56,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @ceil_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -112,29 +88,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @fabs_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -158,29 +118,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_fabs_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -204,29 +148,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @floor_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -250,29 +178,13 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sqrt_4x( ; NOACCELERATE-NEXT: entry: ; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; NOACCELERATE-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) -; NOACCELERATE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; NOACCELERATE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -296,15 +208,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @exp_4x( ; NOACCELERATE-NEXT: entry: @@ -345,15 +249,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @expm1_4x( ; NOACCELERATE-NEXT: entry: @@ -394,15 +290,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @log_4x( ; NOACCELERATE-NEXT: entry: @@ -443,15 +331,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @log1p_4x( ; NOACCELERATE-NEXT: entry: @@ -544,15 +424,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @logb_4x( ; NOACCELERATE-NEXT: entry: @@ -593,15 +465,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sin_4x( ; NOACCELERATE-NEXT: entry: @@ -642,15 +506,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @cos_4x( ; NOACCELERATE-NEXT: entry: @@ -691,15 +547,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @tan_4x( ; NOACCELERATE-NEXT: entry: @@ -740,15 +588,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @asin_4x( ; NOACCELERATE-NEXT: entry: @@ -789,15 +629,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @acos_4x( ; NOACCELERATE-NEXT: entry: @@ -838,15 +670,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @atan_4x( ; NOACCELERATE-NEXT: entry: @@ -887,15 +711,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @sinh_4x( ; NOACCELERATE-NEXT: entry: @@ -936,15 +752,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @cosh_4x( ; NOACCELERATE-NEXT: entry: @@ -985,15 +793,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @tanh_4x( ; NOACCELERATE-NEXT: entry: @@ -1034,15 +834,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @asinh_4x( ; NOACCELERATE-NEXT: entry: @@ -1083,15 +875,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @acosh_4x( ; NOACCELERATE-NEXT: entry: @@ -1132,15 +916,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @atanh_4x( ; NOACCELERATE-NEXT: entry: @@ -1220,15 +996,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; ; NOACCELERATE-LABEL: @int_cos_4x( ; NOACCELERATE-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -29,40 +29,24 @@ ; GATHER: for.body: ; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2 -; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3 -; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4 -; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 -; GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5 -; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 -; GATHER-NEXT: [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6 -; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7 -; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> -; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 -; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 -; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 -; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 -; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 -; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 -; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 -; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 -; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 -; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 -; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 -; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 -; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 -; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) -; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]] +; GATHER-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; GATHER-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP18]], [[P17]] +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR28330( @@ -165,40 +149,24 @@ ; GATHER: for.body: ; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2 -; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3 -; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4 -; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 -; GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5 -; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 -; GATHER-NEXT: [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6 -; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7 -; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> -; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 -; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 -; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 -; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 -; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 -; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 -; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0 -; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 -; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 -; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 -; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 -; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 -; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 -; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 -; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) -; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 +; GATHER-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 +; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 +; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 +; GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 +; GATHER-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP18]], -5 +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll @@ -10,11 +10,7 @@ define <2 x float> @insertelement-fixed-vector() { ; CHECK-LABEL: @insertelement-fixed-vector( ; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x float> [[I1]] +; CHECK-NEXT: ret <2 x float> [[TMP1]] ; %f0 = tail call fast float @llvm.fabs.f32(float undef) %f1 = tail call fast float @llvm.fabs.f32(float undef) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll @@ -10,11 +10,7 @@ define <2 x float> @insertelement-fixed-vector() { ; CHECK-LABEL: @insertelement-fixed-vector( ; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x float> [[I1]] +; CHECK-NEXT: ret <2 x float> [[TMP1]] ; %f0 = tail call fast float @llvm.fabs.f32(float undef) %f1 = tail call fast float @llvm.fabs.f32(float undef) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -123,8 +123,8 @@ ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -190,19 +190,20 @@ ; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP1_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]] +; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -123,8 +123,8 @@ ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -190,19 +190,20 @@ ; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP1_2:%.*]] = xor i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_3:%.*]] = xor i32 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]] +; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_2]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -4,22 +4,17 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @uadd_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @uadd_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -34,22 +29,17 @@ } define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @usub_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @usub_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @usub_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -64,22 +54,17 @@ } define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @sadd_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @sadd_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @sadd_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -94,22 +79,17 @@ } define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @ssub_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @ssub_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @ssub_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -224,36 +204,21 @@ } define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v3i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX7-NEXT: ret <3 x i16> [[INS_2]] -; -; GFX8-LABEL: @uadd_sat_v3i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[TMP3]], i64 0 -; GFX8-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1 -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX8-NEXT: ret <3 x i16> [[INS_2]] +; GCN-LABEL: @uadd_sat_v3i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 +; GCN-NEXT: ret <3 x i16> [[INS_2]] ; bb: %arg0.0 = extractelement <3 x i16> %arg0, i64 0 @@ -272,36 +237,25 @@ } define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v4i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2 -; GFX7-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2 -; GFX7-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 -; GFX7-NEXT: ret <4 x i16> [[INS_3]] -; -; GFX8-LABEL: @uadd_sat_v4i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> -; GFX8-NEXT: ret <4 x i16> [[INS_3]] +; GCN-LABEL: @uadd_sat_v4i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2 +; GCN-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 +; GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i16> [[INS_3]] ; bb: %arg0.0 = extractelement <4 x i16> %arg0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -4,22 +4,17 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @uadd_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @uadd_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -34,22 +29,17 @@ } define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @usub_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @usub_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @usub_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -64,22 +54,17 @@ } define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @sadd_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @sadd_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @sadd_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -94,22 +79,17 @@ } define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { -; GFX7-LABEL: @ssub_sat_v2i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: ret <2 x i16> [[INS_1]] -; -; GFX8-LABEL: @ssub_sat_v2i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GCN-LABEL: @ssub_sat_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: ret <2 x i16> [[INS_1]] ; bb: %arg0.0 = extractelement <2 x i16> %arg0, i64 0 @@ -224,36 +204,21 @@ } define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v3i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX7-NEXT: ret <3 x i16> [[INS_2]] -; -; GFX8-LABEL: @uadd_sat_v3i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0 -; GFX8-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1 -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX8-NEXT: ret <3 x i16> [[INS_2]] +; GCN-LABEL: @uadd_sat_v3i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 +; GCN-NEXT: ret <3 x i16> [[INS_2]] ; bb: %arg0.0 = extractelement <3 x i16> %arg0, i64 0 @@ -272,36 +237,25 @@ } define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { -; GFX7-LABEL: @uadd_sat_v4i16( -; GFX7-NEXT: bb: -; GFX7-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 -; GFX7-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 -; GFX7-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2 -; GFX7-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 -; GFX7-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 -; GFX7-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 -; GFX7-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2 -; GFX7-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 -; GFX7-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) -; GFX7-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) -; GFX7-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX7-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX7-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 -; GFX7-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 -; GFX7-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 -; GFX7-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 -; GFX7-NEXT: ret <4 x i16> [[INS_3]] -; -; GFX8-LABEL: @uadd_sat_v4i16( -; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> -; GFX8-NEXT: ret <4 x i16> [[INS_3]] +; GCN-LABEL: @uadd_sat_v4i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 +; GCN-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 +; GCN-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0]], i64 2 +; GCN-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 +; GCN-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 +; GCN-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 +; GCN-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1]], i64 2 +; GCN-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 +; GCN-NEXT: [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GCN-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GCN-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) +; GCN-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) +; GCN-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0 +; GCN-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GCN-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 +; GCN-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 +; GCN-NEXT: ret <4 x i16> [[INS_3]] ; bb: %arg0.0 = extractelement <4 x i16> %arg0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s @@ -8,6 +9,16 @@ ; GFX8: call <2 x i16> @llvm.bswap.v2i16( define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +; GCN-LABEL: @bswap_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x i16> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x i16> [[TMP5]] +; bb: %tmp = extractelement <2 x i16> %arg, i64 0 %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) @@ -22,6 +33,16 @@ ; GCN: call i32 @llvm.bswap.i32 ; GCN: call i32 @llvm.bswap.i32 define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +; GCN-LABEL: @bswap_v2i32( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x i32> [[TMP5]] +; bb: %tmp = extractelement <2 x i32> %arg, i64 0 %tmp1 = tail call i32 @llvm.bswap.i32(i32 %tmp) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s @@ -8,6 +9,16 @@ ; GFX8: call <2 x i16> @llvm.bswap.v2i16( define <2 x i16> @bswap_v2i16(<2 x i16> %arg) { +; GCN-LABEL: @bswap_v2i16( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x i16> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x i16> [[TMP5]] +; bb: %tmp = extractelement <2 x i16> %arg, i64 0 %tmp1 = tail call i16 @llvm.bswap.i16(i16 %tmp) @@ -22,6 +33,16 @@ ; GCN: call i32 @llvm.bswap.i32 ; GCN: call i32 @llvm.bswap.i32 define <2 x i32> @bswap_v2i32(<2 x i32> %arg) { +; GCN-LABEL: @bswap_v2i32( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x i32> [[TMP5]] +; bb: %tmp = extractelement <2 x i32> %arg, i64 0 %tmp1 = tail call i32 @llvm.bswap.i32(i32 %tmp) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s @@ -8,6 +9,16 @@ ; GFX8: call <2 x half> @llvm.round.v2f16( define <2 x half> @round_v2f16(<2 x half> %arg) { +; GCN-LABEL: @round_v2f16( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x half> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call half @llvm.round.f16(half [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call half @llvm.round.f16(half [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x half> [[TMP5]] +; bb: %tmp = extractelement <2 x half> %arg, i64 0 %tmp1 = tail call half @llvm.round.half(half %tmp) @@ -22,6 +33,16 @@ ; GCN: call float @llvm.round.f32( ; GCN: call float @llvm.round.f32( define <2 x float> @round_v2f32(<2 x float> %arg) { +; GCN-LABEL: @round_v2f32( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call float @llvm.round.f32(float [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call float @llvm.round.f32(float [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x float> [[TMP5]] +; bb: %tmp = extractelement <2 x float> %arg, i64 0 %tmp1 = tail call float @llvm.round.f32(float %tmp) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s @@ -8,6 +9,16 @@ ; GFX8: call <2 x half> @llvm.round.v2f16( define <2 x half> @round_v2f16(<2 x half> %arg) { +; GCN-LABEL: @round_v2f16( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x half> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call half @llvm.round.f16(half [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x half> undef, half [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x half> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call half @llvm.round.f16(half [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x half> [[TMP5]] +; bb: %tmp = extractelement <2 x half> %arg, i64 0 %tmp1 = tail call half @llvm.round.half(half %tmp) @@ -22,6 +33,16 @@ ; GCN: call float @llvm.round.f32( ; GCN: call float @llvm.round.f32( define <2 x float> @round_v2f32(<2 x float> %arg) { +; GCN-LABEL: @round_v2f32( +; GCN-NEXT: bb: +; GCN-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[ARG:%.*]], i64 0 +; GCN-NEXT: [[TMP1:%.*]] = tail call float @llvm.round.f32(float [[TMP]]) +; GCN-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i64 0 +; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[ARG]], i64 1 +; GCN-NEXT: [[TMP4:%.*]] = tail call float @llvm.round.f32(float [[TMP3]]) +; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i64 1 +; GCN-NEXT: ret <2 x float> [[TMP5]] +; bb: %tmp = extractelement <2 x float> %arg, i64 0 %tmp1 = tail call float @llvm.round.f32(float %tmp) diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll @@ -4,15 +4,7 @@ define <4 x i32> @PR13837(<4 x float> %in) { ; CHECK-LABEL: @PR13837( ; CHECK-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[V3]] +; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %t0 = extractelement <4 x float> %in, i64 0 %t1 = extractelement <4 x float> %in, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll --- a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll +++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll @@ -4,15 +4,7 @@ define <4 x i32> @PR13837(<4 x float> %in) { ; CHECK-LABEL: @PR13837( ; CHECK-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[V3]] +; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %t0 = extractelement <4 x float> %in, i64 0 %t1 = extractelement <4 x float> %in, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll @@ -5,15 +5,13 @@ ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32 +; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5 +; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32 +; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll @@ -5,15 +5,13 @@ ; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x half> undef, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32 +; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5 +; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32 +; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], @@ -45,11 +45,9 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll @@ -7,50 +7,11 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { -; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; SSE-NEXT: ret <8 x float> [[R7]] -; -; SLM-LABEL: @sitofp_uitofp( -; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R7]] -; -; AVX-LABEL: @sitofp_uitofp( -; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @sitofp_uitofp( -; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @sitofp_uitofp( +; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -164,8 +125,8 @@ ; AVX512-LABEL: @fptosi_fptoui( ; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -250,8 +211,8 @@ ; CHECK-LABEL: @sext_zext( ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 @@ -362,24 +323,26 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8( +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SLM-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SLM-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float +; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SLM-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float +; SLM-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SLM-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SLM-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2 -; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -7,50 +7,11 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 define <8 x float> @sitofp_uitofp(<8 x i32> %a) { -; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float -; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float -; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float -; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float -; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; SSE-NEXT: ret <8 x float> [[R7]] -; -; SLM-LABEL: @sitofp_uitofp( -; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R7]] -; -; AVX-LABEL: @sitofp_uitofp( -; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @sitofp_uitofp( -; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @sitofp_uitofp( +; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float> +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -164,8 +125,8 @@ ; AVX512-LABEL: @fptosi_fptoui( ; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32> ; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32> -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -250,8 +211,8 @@ ; CHECK-LABEL: @sext_zext( ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32> -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 @@ -362,24 +323,26 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8( +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SLM-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SLM-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float> -; SLM-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float +; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SLM-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float +; SLM-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SLM-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SLM-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1 -; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2 -; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -10,8 +10,8 @@ ; CHECK-LABEL: @fadd_fsub_v8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,58 +49,11 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @fmul_fdiv_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R7]] -; -; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 -; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 -; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], [[B3]] -; SLM-NEXT: [[AB4:%.*]] = fmul float [[A4]], [[B4]] -; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] -; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] -; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; SLM-NEXT: ret <8 x float> [[R7]] -; -; AVX-LABEL: @fmul_fdiv_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @fmul_fdiv_v8f32( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @fmul_fdiv_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -10,8 +10,8 @@ ; CHECK-LABEL: @fadd_fsub_v8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -49,58 +49,11 @@ } define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) { -; SSE-LABEL: @fmul_fdiv_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R7]] -; -; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 -; SLM-NEXT: [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3 -; SLM-NEXT: [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4 -; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 -; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 -; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], [[B3]] -; SLM-NEXT: [[AB4:%.*]] = fmul float [[A4]], [[B4]] -; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] -; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] -; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; SLM-NEXT: ret <8 x float> [[R7]] -; -; AVX-LABEL: @fmul_fdiv_v8f32( -; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R7]] -; -; AVX512-LABEL: @fmul_fdiv_v8f32( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x float> [[R7]] +; CHECK-LABEL: @fmul_fdiv_v8f32( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -10,8 +10,8 @@ ; CHECK-LABEL: @add_sub_v8i32( ; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -52,8 +52,8 @@ ; CHECK-LABEL: @add_and_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -78,8 +78,8 @@ ; CHECK-LABEL: @add_mul_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -101,44 +101,11 @@ } define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @ashr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; -; AVX1-LABEL: @ashr_shl_v8i32( -; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> -; AVX1-NEXT: ret <8 x i32> [[R7]] -; -; AVX2-LABEL: @ashr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R7]] -; -; AVX512-LABEL: @ashr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; CHECK-LABEL: @ashr_shl_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -176,14 +143,6 @@ } define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { -; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], @@ -195,14 +154,14 @@ ; AVX2-LABEL: @ashr_shl_v8i32_const( ; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], ; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], -; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R7]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX512-LABEL: @ashr_shl_v8i32_const( ; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], ; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -232,116 +191,71 @@ } define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SSE-NEXT: ret <8 x i32> [[R7]] -; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 -; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 -; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] -; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 -; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP10]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 -; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 -; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP8]], <8 x i32> +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP10]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -412,26 +326,89 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 +; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX2-NEXT: ret <8 x i32> [[R7]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 +; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -466,8 +443,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -10,8 +10,8 @@ ; CHECK-LABEL: @add_sub_v8i32( ; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -52,8 +52,8 @@ ; CHECK-LABEL: @add_and_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -78,8 +78,8 @@ ; CHECK-LABEL: @add_mul_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]] -; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R3]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %a0 = extractelement <4 x i32> %a, i32 0 %a1 = extractelement <4 x i32> %a, i32 1 @@ -101,44 +101,11 @@ } define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @ashr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; -; AVX1-LABEL: @ashr_shl_v8i32( -; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 -; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> -; AVX1-NEXT: ret <8 x i32> [[R7]] -; -; AVX2-LABEL: @ashr_shl_v8i32( -; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R7]] -; -; AVX512-LABEL: @ashr_shl_v8i32( -; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; CHECK-LABEL: @ashr_shl_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -176,14 +143,6 @@ } define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { -; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R7]] -; ; AVX1-LABEL: @ashr_shl_v8i32_const( ; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], @@ -195,14 +154,14 @@ ; AVX2-LABEL: @ashr_shl_v8i32_const( ; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], ; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], -; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R7]] +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX512-LABEL: @ashr_shl_v8i32_const( ; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], ; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], -; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R7]] +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -232,116 +191,71 @@ } define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 -; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] -; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SSE-NEXT: ret <8 x i32> [[R7]] -; ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_lshr_shl_v8i32( -; AVX2-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 -; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 -; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> ; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] -; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 -; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP8]], <8 x i32> +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP10]], <8 x i32> ; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_lshr_shl_v8i32( -; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 -; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 -; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> ; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] -; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP5]], [[TMP6]] +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP5]], [[TMP6]] +; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> +; AVX512-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP8]], <8 x i32> +; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP10]], <8 x i32> ; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 @@ -412,26 +326,89 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 +; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX2-NEXT: ret <8 x i32> [[R7]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[TMP5]], i32 1 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 +; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -466,8 +443,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[R7]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP5]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -14,11 +14,7 @@ define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_add_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -34,11 +30,7 @@ define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_sub_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -54,11 +46,7 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_mul_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -74,11 +62,7 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @buildvector_div_2f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: ret <2 x double> [[R1]] +; SSE-NEXT: ret <2 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_2f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 @@ -93,19 +77,11 @@ ; ; AVX-LABEL: @buildvector_div_2f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: ret <2 x double> [[R1]] +; AVX-NEXT: ret <2 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_2f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: ret <2 x double> [[R1]] +; AVX512-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -121,15 +97,7 @@ define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_add_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -153,15 +121,7 @@ define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_sub_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -185,15 +145,7 @@ define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_mul_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -217,15 +169,7 @@ define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_div_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -253,15 +197,7 @@ define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_add_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -285,15 +221,7 @@ define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_sub_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -317,15 +245,7 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_mul_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -349,15 +269,7 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @buildvector_div_4f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; SSE-NEXT: ret <4 x double> [[R3]] +; SSE-NEXT: ret <4 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_4f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 @@ -380,27 +292,11 @@ ; ; AVX-LABEL: @buildvector_div_4f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; AVX-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; AVX-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; AVX-NEXT: ret <4 x double> [[R3]] +; AVX-NEXT: ret <4 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_4f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; AVX512-NEXT: ret <4 x double> [[R3]] +; AVX512-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -424,23 +320,7 @@ define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_add_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -480,23 +360,7 @@ define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_sub_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -536,23 +400,7 @@ define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_mul_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -592,23 +440,7 @@ define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_div_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -652,23 +484,7 @@ define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_add_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -708,23 +524,7 @@ define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_sub_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -764,23 +564,7 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_mul_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -820,23 +604,7 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) { ; SSE-LABEL: @buildvector_div_8f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; SSE-NEXT: ret <8 x double> [[R7]] +; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 @@ -875,43 +643,11 @@ ; ; AVX-LABEL: @buildvector_div_8f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; AVX-NEXT: ret <8 x double> [[R7]] +; AVX-NEXT: ret <8 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_8f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; AVX512-NEXT: ret <8 x double> [[R7]] +; AVX512-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -951,39 +687,7 @@ define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_add_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1055,39 +759,7 @@ define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_sub_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1159,39 +831,7 @@ define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_mul_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1263,39 +903,7 @@ define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_div_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -14,11 +14,7 @@ define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_add_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -34,11 +30,7 @@ define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_sub_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -54,11 +46,7 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @buildvector_mul_2f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R1]] +; CHECK-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -74,11 +62,7 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: @buildvector_div_2f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: ret <2 x double> [[R1]] +; SSE-NEXT: ret <2 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_2f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 @@ -93,19 +77,11 @@ ; ; AVX-LABEL: @buildvector_div_2f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: ret <2 x double> [[R1]] +; AVX-NEXT: ret <2 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_2f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: ret <2 x double> [[R1]] +; AVX512-NEXT: ret <2 x double> [[TMP1]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -121,15 +97,7 @@ define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_add_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -153,15 +121,7 @@ define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_sub_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -185,15 +145,7 @@ define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_mul_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -217,15 +169,7 @@ define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @buildvector_div_4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[R3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 @@ -253,15 +197,7 @@ define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_add_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -285,15 +221,7 @@ define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_sub_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -317,15 +245,7 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @buildvector_mul_4f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x double> [[R3]] +; CHECK-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -349,15 +269,7 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @buildvector_div_4f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; SSE-NEXT: ret <4 x double> [[R3]] +; SSE-NEXT: ret <4 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_4f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 @@ -380,27 +292,11 @@ ; ; AVX-LABEL: @buildvector_div_4f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; AVX-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; AVX-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; AVX-NEXT: ret <4 x double> [[R3]] +; AVX-NEXT: ret <4 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_4f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3 -; AVX512-NEXT: ret <4 x double> [[R3]] +; AVX512-NEXT: ret <4 x double> [[TMP1]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -424,23 +320,7 @@ define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_add_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -480,23 +360,7 @@ define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_sub_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -536,23 +400,7 @@ define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_mul_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -592,23 +440,7 @@ define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @buildvector_div_8f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -652,23 +484,7 @@ define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_add_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -708,23 +524,7 @@ define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_sub_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -764,23 +564,7 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: @buildvector_mul_8f64( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x double> [[R7]] +; CHECK-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -820,23 +604,7 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) { ; SSE-LABEL: @buildvector_div_8f64( ; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; SSE-NEXT: ret <8 x double> [[R7]] +; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 @@ -875,43 +643,11 @@ ; ; AVX-LABEL: @buildvector_div_8f64( ; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; AVX-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; AVX-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; AVX-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; AVX-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; AVX-NEXT: ret <8 x double> [[R7]] +; AVX-NEXT: ret <8 x double> [[TMP1]] ; ; AVX512-LABEL: @buildvector_div_8f64( ; AVX512-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] -; AVX512-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0 -; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7 -; AVX512-NEXT: ret <8 x double> [[R7]] +; AVX512-NEXT: ret <8 x double> [[TMP1]] ; %a0 = extractelement <8 x double> %a, i32 0 %a1 = extractelement <8 x double> %a, i32 1 @@ -951,39 +687,7 @@ define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_add_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1055,39 +759,7 @@ define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_sub_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1159,39 +831,7 @@ define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_mul_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 @@ -1263,39 +903,7 @@ define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: @buildvector_div_16f32( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8 -; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9 -; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10 -; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11 -; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12 -; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13 -; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 -; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15 -; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15 -; CHECK-NEXT: ret <16 x float> [[R15]] +; CHECK-NEXT: ret <16 x float> [[TMP1]] ; %a0 = extractelement <16 x float> %a, i32 0 %a1 = extractelement <16 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll @@ -13,15 +13,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3 -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]], align 16 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[SINK:%.*]], align 16 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll @@ -13,15 +13,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3 -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]], align 16 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[SINK:%.*]], align 16 ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -487,15 +487,12 @@ define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 { ; CHECK-LABEL: @fptosi_4xf64_4i32( -; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32 -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[RES3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <4 x double> [[TMP4]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %cvt0 = fptosi double %a0 to i32 %cvt1 = fptosi double %a1 to i32 @@ -510,15 +507,12 @@ define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 { ; CHECK-LABEL: @fptosi_4xf32_4i32( -; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32 -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[RES3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %cvt0 = fptosi float %a0 to i32 %cvt1 = fptosi float %a1 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -487,15 +487,12 @@ define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 { ; CHECK-LABEL: @fptosi_4xf64_4i32( -; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32 -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[RES3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <4 x double> [[TMP4]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %cvt0 = fptosi double %a0 to i32 %cvt1 = fptosi double %a1 to i32 @@ -510,15 +507,12 @@ define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 { ; CHECK-LABEL: @fptosi_4xf32_4i32( -; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32 -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[RES3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %cvt0 = fptosi float %a0 to i32 %cvt1 = fptosi float %a1 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -11,28 +11,11 @@ ; define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: @test_v2f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x double> [[TMP3]] -; -; SLM-LABEL: @test_v2f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1 -; SLM-NEXT: ret <2 x double> [[R01]] -; -; AVX-LABEL: @test_v2f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x double> [[TMP3]] +; CHECK-LABEL: @test_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x double> [[TMP3]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -344,13 +327,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add <4 x i16> [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[RV7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[RV11:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP10]], <16 x i32> +; SSE-NEXT: [[RV15:%.*]] = shufflevector <16 x i16> [[RV11]], <16 x i16> [[TMP14]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -11,28 +11,11 @@ ; define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: @test_v2f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x double> [[TMP3]] -; -; SLM-LABEL: @test_v2f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1 -; SLM-NEXT: ret <2 x double> [[R01]] -; -; AVX-LABEL: @test_v2f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x double> [[TMP3]] +; CHECK-LABEL: @test_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x double> [[TMP3]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -344,13 +327,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = add <4 x i16> [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[RV7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[RV11:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP10]], <16 x i32> +; SSE-NEXT: [[RV15:%.*]] = shufflevector <16 x i16> [[RV11]], <16 x i16> [[TMP14]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -11,28 +11,11 @@ ; define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: @test_v2f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x double> [[TMP3]] -; -; SLM-LABEL: @test_v2f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1 -; SLM-NEXT: ret <2 x double> [[R01]] -; -; AVX-LABEL: @test_v2f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x double> [[TMP3]] +; CHECK-LABEL: @test_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x double> [[TMP3]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -344,13 +327,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = sub <4 x i16> [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[RV7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[RV11:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP10]], <16 x i32> +; SSE-NEXT: [[RV15:%.*]] = shufflevector <16 x i16> [[RV11]], <16 x i16> [[TMP14]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -11,28 +11,11 @@ ; define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: @test_v2f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x double> [[TMP3]] -; -; SLM-LABEL: @test_v2f64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1 -; SLM-NEXT: ret <2 x double> [[R01]] -; -; AVX-LABEL: @test_v2f64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x double> [[TMP3]] +; CHECK-LABEL: @test_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x double> [[TMP3]] ; %a0 = extractelement <2 x double> %a, i32 0 %a1 = extractelement <2 x double> %a, i32 1 @@ -344,13 +327,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[TMP11:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> undef, <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = sub <4 x i16> [[TMP11]], [[TMP12]] +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> undef, <16 x i32> +; SSE-NEXT: [[RV7:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[RV11:%.*]] = shufflevector <16 x i16> [[RV7]], <16 x i16> [[TMP10]], <16 x i32> +; SSE-NEXT: [[RV15:%.*]] = shufflevector <16 x i16> [[RV11]], <16 x i16> [[TMP14]], <16 x i32> ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -9,15 +9,7 @@ ; CHECK-LABEL: @simple_select( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -131,15 +123,7 @@ ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[SHUFFLE]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[SHUFFLE1]], <4 x float> [[SHUFFLE2]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -176,16 +160,8 @@ ; CHECK-LABEL: @simple_select_users( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: call void @v4f32_user(<4 x float> [[RD]]) [[ATTR0:#.*]] -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -217,45 +193,6 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { -; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> poison, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] -; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 %c2 = extractelement <4 x i32> %c, i32 2 @@ -312,11 +249,7 @@ ; CHECK-LABEL: @simple_select_v2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: ret <2 x float> [[RB]] +; CHECK-NEXT: ret <2 x float> [[TMP2]] ; %c0 = extractelement <2 x i32> %c, i32 0 %c1 = extractelement <2 x i32> %c, i32 1 @@ -384,15 +317,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @reschedule_extract( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -418,15 +343,7 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @take_credit( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -456,15 +373,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> poison, double [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0 -; CHECK-NEXT: ret <4 x double> [[I4]] +; CHECK-NEXT: ret <4 x double> [[TMP6]] ; %t0 = fadd double %w , 0.000000e+00 %t1 = fadd double %x , 1.000000e+00 @@ -484,23 +393,7 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { ; CHECK-LABEL: @_vadd256( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -9,15 +9,7 @@ ; CHECK-LABEL: @simple_select( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -131,15 +123,7 @@ ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[SHUFFLE]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[SHUFFLE1]], <4 x float> [[SHUFFLE2]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -176,16 +160,8 @@ ; CHECK-LABEL: @simple_select_users( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3 -; CHECK-NEXT: call void @v4f32_user(<4 x float> [[RD]]) [[ATTR0:#.*]] -; CHECK-NEXT: ret <4 x float> [[RD]] +; CHECK-NEXT: call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 @@ -217,45 +193,6 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { -; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 -; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 -; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RD]] -; %c0 = extractelement <4 x i32> %c, i32 0 %c1 = extractelement <4 x i32> %c, i32 1 %c2 = extractelement <4 x i32> %c, i32 2 @@ -312,11 +249,7 @@ ; CHECK-LABEL: @simple_select_v2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1 -; CHECK-NEXT: ret <2 x float> [[RB]] +; CHECK-NEXT: ret <2 x float> [[TMP2]] ; %c0 = extractelement <2 x i32> %c, i32 0 %c1 = extractelement <2 x i32> %c, i32 1 @@ -384,15 +317,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @reschedule_extract( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -418,15 +343,7 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @take_credit( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 @@ -456,15 +373,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0 -; CHECK-NEXT: ret <4 x double> [[I4]] +; CHECK-NEXT: ret <4 x double> [[TMP6]] ; %t0 = fadd double %w , 0.000000e+00 %t1 = fadd double %x , 1.000000e+00 @@ -484,23 +393,7 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 { ; CHECK-LABEL: @_vadd256( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 -; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 -; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 -; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 -; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 7 -; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]] +; CHECK-NEXT: ret <8 x float> [[TMP1]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -54,13 +54,11 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load float, float* [[GEP1]], align 4 ; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[X1]], i32 1 ; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] @@ -85,13 +83,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: ret <4 x float> [[SHUFFLE]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,13 +54,11 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4 +; CHECK-NEXT: [[X1:%.*]] = load float, float* [[GEP1]], align 4 ; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0 +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[X1]], i32 1 ; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 ; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] @@ -85,13 +83,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x float> [[I3]] +; CHECK-NEXT: ret <4 x float> [[SHUFFLE]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -12,23 +12,19 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> [[TMP5]], i8 [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], ; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], -; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -183,11 +183,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll @@ -6,11 +6,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SOURCE:%.*]] = insertelement <2 x float> poison, float undef, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1 -; CHECK-NEXT: ret <2 x float> [[RES2]] +; CHECK-NEXT: ret <2 x float> [[TMP0]] ; entry: %source = insertelement <2 x float> poison, float undef, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll @@ -6,11 +6,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1 -; CHECK-NEXT: ret <2 x float> [[RES2]] +; CHECK-NEXT: ret <2 x float> [[TMP0]] ; entry: %source = insertelement <2 x float> undef, float undef, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll @@ -31,15 +31,12 @@ define void @test1_vec(float %a, float %b, float %c, float %d, <4 x i32>* nocapture %p) { ; CHECK-LABEL: @test1_vec( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[A:%.*]] to i32 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0 -; CHECK-NEXT: [[CONV1:%.*]] = fptosi float [[B:%.*]] to i32 -; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV1]], i32 1 -; CHECK-NEXT: [[CONV3:%.*]] = fptosi float [[C:%.*]] to i32 -; CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[CONV3]], i32 2 -; CHECK-NEXT: [[CONV5:%.*]] = fptosi float [[D:%.*]] to i32 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[CONV5]], i32 3 -; CHECK-NEXT: store <4 x i32> [[VECINIT6]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; entry: @@ -84,15 +81,12 @@ define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, <4 x i32>* nocapture %4) { ; CHECK-LABEL: @test2_vec( -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP0:%.*]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP1:%.*]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP2:%.*]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP3:%.*]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3 -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], +; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void ; %6 = add nsw i32 %0, 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll @@ -9,11 +9,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0 -; CHECK-NEXT: ret <2 x float> [[INS0]] +; CHECK-NEXT: ret <2 x float> [[TMP2]] ; entry: %0 = bitcast {{float, float}}* %A to <2 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll @@ -9,11 +9,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0 -; CHECK-NEXT: ret <2 x float> [[INS0]] +; CHECK-NEXT: ret <2 x float> [[TMP2]] ; entry: %0 = bitcast {{float, float}}* %A to <2 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -12,70 +12,56 @@ ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 +; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 +; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 +; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 +; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], +; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 +; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 +; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 +; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], +; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SHR_I_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SHR_1_I_I]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SHR_2_I_I]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[SHR_3_I_I]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[SHR_4_I_I]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SHR_5_I_I]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_6_I_I]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_7_I_I]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_8_I_I]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_9_I_I]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[SHR_10_I_I]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_11_I_I]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[SHR_12_I_I]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[SHR_13_I_I]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = trunc <16 x i32> [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i8> [[TMP17]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP18]], <16 x i8>* [[TMP19]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll @@ -13,24 +13,17 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { ; SSE-LABEL: @loadext_2i8_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %i0 = load i8, i8* %p0, align 1 @@ -43,40 +36,14 @@ } define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) { -; SSE2-LABEL: @loadext_4i8_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i8_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i8_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -85,15 +52,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -118,19 +77,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -139,15 +89,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -168,68 +110,18 @@ } define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; SSE-NEXT: ret <8 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -242,23 +134,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i16> [[V7]] +; AVX-NEXT: ret <8 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -295,68 +171,18 @@ } define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -369,23 +195,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -422,124 +232,26 @@ } define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) { -; SSE2-LABEL: @loadext_16i8_to_16i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] -; -; SLM-LABEL: @loadext_16i8_to_16i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = sext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SSE-LABEL: @loadext_16i8_to_16i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 +; SSE-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 +; SSE-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 +; SSE-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 +; SSE-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 +; SSE-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 +; SSE-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 +; SSE-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> +; SSE-NEXT: ret <16 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -560,39 +272,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; AVX-NEXT: ret <16 x i16> [[V15]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -667,24 +347,17 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { ; SSE-LABEL: @loadext_2i16_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %i0 = load i16, i16* %p0, align 1 @@ -697,40 +370,14 @@ } define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) { -; SSE2-LABEL: @loadext_4i16_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i16_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i16_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -739,15 +386,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -772,19 +411,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -793,15 +423,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -822,68 +444,18 @@ } define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) { -; SSE2-LABEL: @loadext_8i16_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i16_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i16_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -896,23 +468,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -955,24 +511,17 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { ; SSE-LABEL: @loadext_2i32_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %i0 = load i32, i32* %p0, align 1 @@ -989,19 +538,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -1010,15 +550,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -13,24 +13,17 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { ; SSE-LABEL: @loadext_2i8_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %i0 = load i8, i8* %p0, align 1 @@ -43,40 +36,14 @@ } define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) { -; SSE2-LABEL: @loadext_4i8_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i8_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i8_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -85,15 +52,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -118,19 +77,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -139,15 +89,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -168,68 +110,18 @@ } define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; SSE-NEXT: ret <8 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -242,23 +134,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i16> [[V7]] +; AVX-NEXT: ret <8 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -295,68 +171,18 @@ } define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) { -; SSE2-LABEL: @loadext_8i8_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i8_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i8_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -369,23 +195,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -422,124 +232,26 @@ } define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) { -; SSE2-LABEL: @loadext_16i8_to_16i16( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* -; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] -; -; SLM-LABEL: @loadext_16i8_to_16i16( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 -; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 -; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 -; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 -; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 -; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 -; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 -; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = sext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SSE-LABEL: @loadext_16i8_to_16i16( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; SSE-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 +; SSE-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 +; SSE-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 +; SSE-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 +; SSE-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 +; SSE-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 +; SSE-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 +; SSE-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> +; SSE-NEXT: ret <16 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -560,39 +272,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; AVX-NEXT: ret <16 x i16> [[V15]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -667,24 +347,17 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { ; SSE-LABEL: @loadext_2i16_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %i0 = load i16, i16* %p0, align 1 @@ -697,40 +370,14 @@ } define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) { -; SSE2-LABEL: @loadext_4i16_to_4i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] -; -; SLM-LABEL: @loadext_4i16_to_4i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SSE-LABEL: @loadext_4i16_to_4i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -739,15 +386,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -772,19 +411,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -793,15 +423,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -822,68 +444,18 @@ } define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) { -; SSE2-LABEL: @loadext_8i16_to_8i32( -; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* -; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] -; -; SLM-LABEL: @loadext_8i16_to_8i32( -; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 -; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 -; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 -; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = sext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = sext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = sext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = sext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SSE-LABEL: @loadext_8i16_to_8i32( +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 +; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 +; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -896,23 +468,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -955,24 +511,17 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { ; SSE-LABEL: @loadext_2i32_to_2i64( ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SSE-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %i0 = load i32, i32* %p0, align 1 @@ -989,19 +538,10 @@ ; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SSE-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> +; SSE-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -1010,15 +550,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll @@ -5,15 +5,7 @@ ; CHECK-LABEL: @sign_extend_v_v( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[VECINIT9]] +; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; entry: %vecext = extractelement <4 x i16> %lhs, i32 0 @@ -35,15 +27,7 @@ ; CHECK-LABEL: @truncate_v_v( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x i16> [[VECINIT9]] +; CHECK-NEXT: ret <4 x i16> [[TMP0]] ; entry: %vecext = extractelement <4 x i32> %lhs, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll @@ -5,15 +5,7 @@ ; CHECK-LABEL: @sign_extend_v_v( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[VECINIT9]] +; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; entry: %vecext = extractelement <4 x i16> %lhs, i32 0 @@ -35,15 +27,7 @@ ; CHECK-LABEL: @truncate_v_v( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3 -; CHECK-NEXT: ret <4 x i16> [[VECINIT9]] +; CHECK-NEXT: ret <4 x i16> [[TMP0]] ; entry: %vecext = extractelement <4 x i32> %lhs, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -1283,16 +1283,24 @@ ; define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-LABEL: @sitofp_4xi32_4f64( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x double> [[RES3]] +; SSE-LABEL: @sitofp_4xi32_4f64( +; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double +; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double +; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double +; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double +; SSE-NEXT: [[RES0:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 +; SSE-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1 +; SSE-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2 +; SSE-NEXT: [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3 +; SSE-NEXT: ret <4 x double> [[RES3]] +; +; AVX-LABEL: @sitofp_4xi32_4f64( +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x double> +; AVX-NEXT: ret <4 x double> [[TMP5]] ; %cvt0 = sitofp i32 %a0 to double %cvt1 = sitofp i32 %a1 to double @@ -1306,16 +1314,24 @@ } define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-LABEL: @sitofp_4xi32_4f32( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RES3]] +; SSE-LABEL: @sitofp_4xi32_4f32( +; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float +; SSE-NEXT: [[RES0:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0 +; SSE-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1 +; SSE-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2 +; SSE-NEXT: [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3 +; SSE-NEXT: ret <4 x float> [[RES3]] +; +; AVX-LABEL: @sitofp_4xi32_4f32( +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> +; AVX-NEXT: ret <4 x float> [[TMP5]] ; %cvt0 = sitofp i32 %a0 to float %cvt1 = sitofp i32 %a1 to float diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -1283,16 +1283,24 @@ ; define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-LABEL: @sitofp_4xi32_4f64( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x double> [[RES3]] +; SSE-LABEL: @sitofp_4xi32_4f64( +; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double +; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double +; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double +; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double +; SSE-NEXT: [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0 +; SSE-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1 +; SSE-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2 +; SSE-NEXT: [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3 +; SSE-NEXT: ret <4 x double> [[RES3]] +; +; AVX-LABEL: @sitofp_4xi32_4f64( +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x double> +; AVX-NEXT: ret <4 x double> [[TMP5]] ; %cvt0 = sitofp i32 %a0 to double %cvt1 = sitofp i32 %a1 to double @@ -1306,16 +1314,24 @@ } define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { -; CHECK-LABEL: @sitofp_4xi32_4f32( -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float -; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float -; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float -; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 -; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1 -; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2 -; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3 -; CHECK-NEXT: ret <4 x float> [[RES3]] +; SSE-LABEL: @sitofp_4xi32_4f32( +; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float +; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float +; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float +; SSE-NEXT: [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0 +; SSE-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1 +; SSE-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2 +; SSE-NEXT: [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3 +; SSE-NEXT: ret <4 x float> [[RES3]] +; +; AVX-LABEL: @sitofp_4xi32_4f32( +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> +; AVX-NEXT: ret <4 x float> [[TMP5]] ; %cvt0 = sitofp i32 %a0 to float %cvt1 = sitofp i32 %a1 to float diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll @@ -11,32 +11,30 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1 ; CHECK-NEXT: br label [[BB283:%.*]] ; CHECK: bb283: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ undef, [[EXIT]] ] ; CHECK-NEXT: br label [[BB284:%.*]] ; CHECK: bb284: -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: ; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] ; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: -; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef -; CHECK-NEXT: [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float> +; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], undef +; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float> ; CHECK-NEXT: br label [[BB283]] ; bb279: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -11,32 +11,30 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1 ; CHECK-NEXT: br label [[BB283:%.*]] ; CHECK: bb283: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ undef, [[EXIT]] ] ; CHECK-NEXT: br label [[BB284:%.*]] ; CHECK: bb284: -; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef -; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: ; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] ; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: -; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef -; CHECK-NEXT: [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float> +; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], undef +; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float> ; CHECK-NEXT: br label [[BB283]] ; bb279: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll @@ -16,32 +16,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i8_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %i0 = load i8, i8* %p0, align 1 @@ -61,33 +50,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] +; SSE2-NEXT: ret <4 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_4i8_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -96,15 +68,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -132,33 +96,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i8_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -167,15 +114,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -207,23 +146,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] +; SSE2-NEXT: ret <8 x i16> [[TMP3]] ; ; SLM-LABEL: @loadext_8i8_to_8i16( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -233,31 +156,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; SLM-NEXT: ret <8 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -270,23 +172,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i16> [[V7]] +; AVX-NEXT: ret <8 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -334,23 +220,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] +; SSE2-NEXT: ret <8 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_8i8_to_8i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -360,31 +230,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -397,23 +246,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -469,39 +302,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] +; SSE2-NEXT: ret <16 x i16> [[TMP3]] ; ; SLM-LABEL: @loadext_16i8_to_16i16( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -519,55 +320,10 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; SLM-NEXT: ret <16 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -588,39 +344,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; AVX-NEXT: ret <16 x i16> [[V15]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -698,32 +422,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i16_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %i0 = load i16, i16* %p0, align 1 @@ -743,33 +456,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] +; SSE2-NEXT: ret <4 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_4i16_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; SLM-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -778,15 +474,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -814,33 +502,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -849,15 +520,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -889,23 +552,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] +; SSE2-NEXT: ret <8 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_8i16_to_8i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -915,31 +562,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -952,23 +578,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -1014,32 +624,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i32_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %i0 = load i32, i32* %p0, align 1 @@ -1059,33 +658,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -1094,15 +676,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll @@ -16,32 +16,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i8_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %i0 = load i8, i8* %p0, align 1 @@ -61,33 +50,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] +; SSE2-NEXT: ret <4 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_4i8_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; SLM-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -96,15 +68,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -132,33 +96,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i8_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -167,15 +114,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -207,23 +146,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i16> [[V7]] +; SSE2-NEXT: ret <8 x i16> [[TMP3]] ; ; SLM-LABEL: @loadext_8i8_to_8i16( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -233,31 +156,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i16> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; SLM-NEXT: ret <8 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -270,23 +172,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i16> [[V7]] +; AVX-NEXT: ret <8 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -334,23 +220,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] +; SSE2-NEXT: ret <8 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_8i8_to_8i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -360,31 +230,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i8_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -397,23 +246,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -469,39 +302,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; SSE2-NEXT: ret <16 x i16> [[V15]] +; SSE2-NEXT: ret <16 x i16> [[TMP3]] ; ; SLM-LABEL: @loadext_16i8_to_16i16( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -519,55 +320,10 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 -; SLM-NEXT: ret <16 x i16> [[V15]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; SLM-NEXT: ret <16 x i16> [[TMP3]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -588,39 +344,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 -; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 -; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 -; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 -; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 -; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 -; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 -; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 -; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 -; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 -; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 -; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 -; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 -; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 -; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 -; AVX-NEXT: ret <16 x i16> [[V15]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 @@ -698,32 +422,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i16_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %i0 = load i16, i16* %p0, align 1 @@ -743,33 +456,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i32> [[V3]] +; SSE2-NEXT: ret <4 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_4i16_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i32> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; SLM-NEXT: ret <4 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -778,15 +474,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i32> [[V3]] +; AVX-NEXT: ret <4 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -814,33 +502,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -849,15 +520,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -889,23 +552,7 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; SSE2-NEXT: ret <8 x i32> [[V7]] +; SSE2-NEXT: ret <8 x i32> [[TMP3]] ; ; SLM-LABEL: @loadext_8i16_to_8i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -915,31 +562,10 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 -; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 -; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 -; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 -; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32 -; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32 -; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32 -; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[V7]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP3]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -952,23 +578,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 -; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 -; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 -; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 -; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 -; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 -; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 -; AVX-NEXT: ret <8 x i32> [[V7]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 @@ -1014,32 +624,21 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: ret <2 x i64> [[V1]] +; SSE2-NEXT: ret <2 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_2i32_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[V1]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; SLM-NEXT: ret <2 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: ret <2 x i64> [[V1]] +; AVX-NEXT: ret <2 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %i0 = load i32, i32* %p0, align 1 @@ -1059,33 +658,16 @@ ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: ret <4 x i64> [[TMP3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 -; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 -; SLM-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; SLM-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SLM-NEXT: ret <4 x i64> [[V3]] +; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> +; SLM-NEXT: ret <4 x i64> [[TMP3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -1094,15 +676,7 @@ ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 -; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 -; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 -; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 -; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 -; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 -; AVX-NEXT: ret <4 x i64> [[V3]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll @@ -9,15 +9,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vmemread(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll @@ -9,15 +9,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vmemread(<4 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 -; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3 -; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16