Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -667,6 +667,20 @@
                       const ValueToValueMap &StridesMap = ValueToValueMap(),
                       bool Assume = false, bool ShouldCheckWrap = true);
 
+/// \brief Attempt to sort the pointers in \p VL and return the sorted indices
+/// in \p SortedIndices, if reordering is required.
+///
+/// Returns 'false' if sorting is not legal, otherwise returns 'true'.
+///
+/// For example, for a given \p VL of memory accesses in program order, a[i+2],
+/// a[i+0], a[i+1] and a[i+3], this function sorts the accesses as a[i+0],
+/// a[i+1], a[i+2], a[i+3] and fills \p SortedIndices with the position of
+/// each sorted access within the original \p VL, i.e. <1,2,0,3>.
+bool sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                     ScalarEvolution &SE,
+                     SmallVectorImpl<unsigned> &SortedIndices);
+
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass.
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
Index: lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- lib/Analysis/LoopAccessAnalysis.cpp
+++ lib/Analysis/LoopAccessAnalysis.cpp
@@ -1087,6 +1087,57 @@
   return Stride;
 }
 
+bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                           ScalarEvolution &SE,
+                           SmallVectorImpl<unsigned> &SortedIndices) {
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
+  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+  OffValPairs.reserve(VL.size());
+
+  // Walk over the pointers, and map each of them to an offset relative to
+  // the first pointer in the array.
+  Value *Ptr0 = VL[0];
+  const SCEV *Scev0 = SE.getSCEV(Ptr0);
+  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
+
+  for (auto *Ptr : VL) {
+    // If a pointer refers to a different underlying object, bail - the
+    // pointers are by definition incomparable.
+    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
+    if (CurrObj != Obj0)
+      return false;
+
+    const SCEV *Scev = SE.getSCEV(Ptr);
+    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
+    // The pointers may not have a constant offset from each other, or SCEV
+    // may just not be smart enough to figure out they do. Regardless,
+    // there's nothing we can do.
+    if (!Diff)
+      return false;
+
+    OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Ptr);
+  }
+  SortedIndices.clear();
+  SortedIndices.resize(VL.size());
+  std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
+
+  // Sort the memory accesses in increasing offset order.
+  std::stable_sort(SortedIndices.begin(), SortedIndices.end(),
+                   [&OffValPairs](unsigned Left, unsigned Right) {
+                     return OffValPairs[Left].first < OffValPairs[Right].first;
+                   });
+
+  // Check if the order is consecutive already.
+  if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
+        return I == SortedIndices[I];
+      }))
+    SortedIndices.clear();
+
+  return true;
+}
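To make the contract above concrete, here is a minimal standalone sketch (plain C++, no LLVM types; the hypothetical sortByOffset stands in for sortPtrAccesses and takes precomputed byte offsets instead of SCEV deltas) of the index sort this function performs:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch only: mirrors the iota + stable_sort + identity-check logic of
// sortPtrAccesses. "Offsets" plays the role of the SCEV-derived deltas
// relative to the first pointer.
static std::vector<unsigned> sortByOffset(const std::vector<int64_t> &Offsets) {
  std::vector<unsigned> Indices(Offsets.size());
  std::iota(Indices.begin(), Indices.end(), 0);
  std::stable_sort(Indices.begin(), Indices.end(), [&](unsigned L, unsigned R) {
    return Offsets[L] < Offsets[R];
  });
  // An identity permutation means no reordering is required; the real
  // function signals this by clearing SortedIndices.
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    if (Indices[I] != I)
      return Indices;
  return {};
}

int main() {
  // Program order a[i+2], a[i+0], a[i+1], a[i+3] with 4-byte elements:
  // sorted order visits VL[1], VL[2], VL[0], VL[3].
  assert((sortByOffset({8, 0, 4, 12}) == std::vector<unsigned>{1, 2, 0, 3}));
  // Already-consecutive accesses yield an empty result.
  assert(sortByOffset({0, 4, 8, 12}).empty());
  return 0;
}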
 
 /// Take the pointer operand from the Load/Store instruction.
 /// Returns NULL if this is not a valid Load/Store instruction.
 static Value *getPointerOperand(Value *I) {
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -452,16 +452,20 @@
 }
 
 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+  unsigned Opcode = E->getOpcode();
   assert(Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue);
   if (Opcode == Instruction::ExtractElement) {
-    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-    return CI && CI->getZExtValue() == Idx;
-  } else {
-    ExtractValueInst *EI = cast<ExtractValueInst>(E);
-    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+    if (!CI)
+      return None;
+    return CI->getZExtValue();
   }
+  ExtractValueInst *EI = cast<ExtractValueInst>(E);
+  if (EI->getNumIndices() != 1)
+    return None;
+  return *EI->idx_begin();
 }
 
 /// \returns True if in-tree use also needs extract. This refers to
@@ -586,6 +590,7 @@
     MustGather.clear();
     ExternalUses.clear();
     NumOpsWantToKeepOrder.clear();
+    DirectOrderNum = 0;
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -598,14 +603,18 @@
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
-  /// \returns true if it is beneficial to reverse the vector order.
-  bool shouldReorder() const {
-    return std::accumulate(
-               NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
-               [](int Val1,
-                  const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
-                 return Val1 + (Val2.second < 0 ? 1 : -1);
-               }) > 0;
+  /// \returns The best order of instructions for vectorization.
+  Optional<ArrayRef<unsigned>> bestOrder() const {
+    auto I = std::max_element(
+        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+          return D1.second < D2.second;
+        });
+    if (I == NumOpsWantToKeepOrder.end() || I->getSecond() <= DirectOrderNum)
+      return None;
+
+    return makeArrayRef(I->getFirst());
+  }
 
   /// \return The vector element size in bits to use when vectorizing the
@@ -652,9 +661,13 @@
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
 
-  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
-  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
-  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
+  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
+  /// can be vectorized to use the original vector (or aggregate "bitcast" to
+  /// a vector) and sets \p CurrentOrder to the identity permutation; otherwise
+  /// returns false, setting \p CurrentOrder to either an empty vector or a
+  /// non-identity permutation that allows reusing the extract instructions.
+  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+                       SmallVectorImpl<unsigned> &CurrentOrder) const;
 
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
@@ -718,6 +731,9 @@
     /// Does this sequence require some shuffling?
     SmallVector<unsigned, 4> ReuseShuffleIndices;
 
+    /// Does this entry require reordering?
+    ArrayRef<unsigned> ReorderIndices;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -733,7 +749,8 @@
 
     /// Create a new VectorizableTree entry.
     void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
-                      ArrayRef<unsigned> ReuseShuffleIndices = None) {
+                      ArrayRef<unsigned> ReuseShuffleIndices = None,
+                      ArrayRef<unsigned> ReorderIndices = None) {
       VectorizableTree.emplace_back(VectorizableTree);
       int idx = VectorizableTree.size() - 1;
       TreeEntry *Last = &VectorizableTree[idx];
@@ -741,6 +758,7 @@
       Last->NeedToGather = !Vectorized;
       Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                        ReuseShuffleIndices.end());
+      Last->ReorderIndices = ReorderIndices;
       if (Vectorized) {
         for (int i = 0, e = VL.size(); i != e; ++i) {
           assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1202,10 +1220,36 @@
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
-  /// Number of operation bundles that contain consecutive operations - number
-  /// of operation bundles that contain consecutive operations in reversed
-  /// order.
-  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
+  using OrdersType = SmallVector<unsigned, 4>;
+  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+  /// sorted SmallVectors of unsigned.
+  struct OrdersTypeDenseMapInfo {
+    static OrdersType getEmptyKey() {
+      OrdersType V;
+      V.push_back(~1U);
+      return V;
+    }
+
+    static OrdersType getTombstoneKey() {
+      OrdersType V;
+      V.push_back(~2U);
+      return V;
+    }
+
+    static unsigned getHashValue(const OrdersType &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Contains orders of operations along with the number of bundles that have
+  /// operations in this order.
+  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+  /// Number of bundles that do not require reordering.
+  unsigned DirectOrderNum = 0;
 
   // Analysis and block reference.
   Function *F;
@@ -1557,17 +1601,29 @@
     }
     case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      bool Reuse = canReuseExtract(VL, VL0);
+      OrdersType CurrentOrder;
+      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOrder[S.Opcode];
-      } else {
-        SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
-        if (canReuseExtract(ReverseVL, VL0))
-          --NumOpsWantToKeepOrder[S.Opcode];
-        BS.cancelScheduling(VL, VL0);
+        ++DirectOrderNum;
+        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+                     ReuseShuffleIndicies);
+        return;
       }
-      newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
+      if (!CurrentOrder.empty()) {
+        DEBUG(dbgs()
+              << "SLP: Reusing or shuffling of reordered extract sequence.\n");
+        // Insert new order with initial value 0, if it does not exist,
+        // otherwise return the iterator to the existing one.
+        auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+        ++I->getSecond();
+        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+                     I->getFirst());
+        return;
+      }
+      DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+      BS.cancelScheduling(VL, VL0);
       return;
     }
     case Instruction::Load: {
@@ -1589,51 +1645,54 @@
       // Make sure all loads in the bundle are simple - we can't vectorize
       // atomic or volatile loads.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
-        LoadInst *L = cast<LoadInst>(VL[i]);
+      SmallVector<Value *, 4> PointerOps;
+      PointerOps.reserve(VL.size());
+      for (Value *V : VL) {
+        auto *L = cast<LoadInst>(V);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
+        PointerOps.emplace_back(L->getPointerOperand());
       }
 
-      // Check if the loads are consecutive, reversed, or neither.
-      // TODO: What we really want is to sort the loads, but for now, check
-      // the two likely directions.
-      bool Consecutive = true;
-      bool ReverseConsecutive = true;
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
-          Consecutive = false;
-          break;
+      OrdersType CurrentOrder;
+      // Check the order of pointer operands.
+      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+        Value *Ptr0;
+        Value *PtrN;
+        if (CurrentOrder.empty()) {
+          Ptr0 = PointerOps.front();
+          PtrN = PointerOps.back();
         } else {
-          ReverseConsecutive = false;
+          Ptr0 = PointerOps[CurrentOrder.front()];
+          PtrN = PointerOps[CurrentOrder.back()];
         }
-      }
-
-      if (Consecutive) {
-        ++NumOpsWantToKeepOrder[S.Opcode];
-        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
-        DEBUG(dbgs() << "SLP: added a vector of loads.\n");
-        return;
-      }
-
-      // If none of the load pairs were consecutive when checked in order,
-      // check the reverse order.
-      if (ReverseConsecutive)
-        for (unsigned i = VL.size() - 1; i > 0; --i)
-          if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
-            ReverseConsecutive = false;
-            break;
+        const SCEV *Scev0 = SE->getSCEV(Ptr0);
+        const SCEV *ScevN = SE->getSCEV(PtrN);
+        const auto *Diff =
+            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+        // Check that the sorted loads are consecutive.
+        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
+          if (CurrentOrder.empty()) {
+            // Original loads are consecutive and do not require reordering.
+            ++DirectOrderNum;
+            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+                         ReuseShuffleIndicies);
+            DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+          } else {
+            // Need to reorder.
+            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+            ++I->getSecond();
+            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+                         ReuseShuffleIndicies, I->getFirst());
+            DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
           }
-
-      if (ReverseConsecutive) {
-        --NumOpsWantToKeepOrder[S.Opcode];
-        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
-        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
-        return;
+          return;
+        }
       }
 
       DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -1944,7 +2003,8 @@
   return N;
 }
 
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+                              SmallVectorImpl<unsigned> &CurrentOrder) const {
   Instruction *E0 = cast<Instruction>(OpValue);
   assert(E0->getOpcode() == Instruction::ExtractElement ||
          E0->getOpcode() == Instruction::ExtractValue);
@@ -1953,6 +2013,8 @@
   // correct offset.
   Value *Vec = E0->getOperand(0);
 
+  CurrentOrder.clear();
+
   // We have to extract from a vector/aggregate with the same number of elements.
   unsigned NElts;
   if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -1972,15 +2034,44 @@
     return false;
 
   // Check that all of the indices extract from the correct offset.
-  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
-    Instruction *Inst = cast<Instruction>(VL[I]);
-    if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
+  bool ShouldKeepOrder = true;
+  unsigned E = VL.size();
+  // Assign to all items the initial value E + 1 so we can check if the
+  // extract instruction index was used already.
+  // Also, later we can check that all the indices are used and we have a
+  // consecutive access in the extract instructions (if at least one element of
+  // CurrentOrder is still E + 1, we don't have consecutive extract
+  // instructions).
+  CurrentOrder.assign(VL.size(), E + 1);
+  for (unsigned I = 0; I < E; ++I) {
+    auto *Inst = cast<Instruction>(VL[I]);
+    if (Inst->getOperand(0) != Vec) {
+      CurrentOrder.clear();
       return false;
-    if (Inst->getOperand(0) != Vec)
+    }
+    Optional<unsigned> Idx = getExtractIndex(Inst);
+    if (!Idx) {
+      CurrentOrder.clear();
       return false;
+    }
+    const unsigned ExtIdx = *Idx;
+    if (ExtIdx != I) {
+      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) {
+        CurrentOrder.clear();
+        return false;
+      }
+      ShouldKeepOrder = false;
+      CurrentOrder[ExtIdx] = I;
+    } else {
+      if (CurrentOrder[I] != E + 1) {
+        CurrentOrder.clear();
+        return false;
+      }
+      CurrentOrder[I] = I;
+    }
   }
 
-  return true;
+  return ShouldKeepOrder;
 }
 
 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -2082,8 +2173,13 @@
               TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
         }
       }
-      if (canReuseExtract(VL, S.OpValue)) {
+      if (!E->NeedToGather) {
         int DeadCost = ReuseShuffleCost;
+        if (!E->ReorderIndices.empty()) {
+          // TODO: Merge this shuffle with the ReuseShuffleCost.
+          DeadCost += TTI->getShuffleCost(
+              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+        }
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
           Instruction *E = cast<Instruction>(VL[i]);
           // If all users are going to be vectorized, instruction can be
@@ -2246,7 +2342,8 @@
           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
       int VecLdCost =
           TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
-      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
+      if (!E->ReorderIndices.empty()) {
+        // TODO: Merge this shuffle with the ReuseShuffleCost.
         VecLdCost += TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
       }
@@ -2923,6 +3020,15 @@
   return V;
 }
 
+static void inversePermutation(ArrayRef<unsigned> Indices,
+                               SmallVectorImpl<unsigned> &Mask) {
+  Mask.clear();
+  const unsigned E = Indices.size();
+  Mask.resize(Indices.size());
+  for (unsigned I = 0; I < E; ++I)
+    Mask[Indices[I]] = I;
+}
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
@@ -2999,10 +3105,19 @@
     }
     case Instruction::ExtractElement: {
-      if (canReuseExtract(E->Scalars, VL0)) {
+      if (!E->NeedToGather) {
         Value *V = VL0->getOperand(0);
-        if (NeedToShuffleReuses) {
+        if (!E->ReorderIndices.empty()) {
+          OrdersType Mask;
+          inversePermutation(E->ReorderIndices, Mask);
           Builder.SetInsertPoint(VL0);
+          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+                                          "reorder_shuffle");
+        }
+        if (NeedToShuffleReuses) {
+          // TODO: Merge this shuffle with the ReorderShuffleMask.
+          if (!E->ReorderIndices.empty())
+            Builder.SetInsertPoint(VL0);
           V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                           E->ReuseShuffleIndices, "shuffle");
         }
@@ -3023,14 +3138,21 @@
       return V;
     }
     case Instruction::ExtractValue: {
-      if (canReuseExtract(E->Scalars, VL0)) {
+      if (!E->NeedToGather) {
         LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
         Builder.SetInsertPoint(LI);
         PointerType *PtrTy =
             PointerType::get(VecTy, LI->getPointerAddressSpace());
         Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
         LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
         Value *NewV = propagateMetadata(V, E->Scalars);
+        if (!E->ReorderIndices.empty()) {
+          OrdersType Mask;
+          inversePermutation(E->ReorderIndices, Mask);
+          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
+                                             Mask, "reorder_shuffle");
+        }
         if (NeedToShuffleReuses) {
+          // TODO: Merge this shuffle with the ReorderShuffleMask.
           NewV = Builder.CreateShuffleVector(
               NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
         }
@@ -3204,10 +3326,9 @@
     case Instruction::Load: {
       // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
-      bool IsReversed =
-          !isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
-      if (IsReversed)
-        VL0 = cast<Instruction>(E->Scalars.back());
+      bool IsReorder = !E->ReorderIndices.empty();
+      if (IsReorder)
+        VL0 = cast<Instruction>(E->Scalars[E->ReorderIndices.front()]);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       LoadInst *LI = cast<LoadInst>(VL0);
@@ -3231,12 +3352,15 @@
       }
       LI->setAlignment(Alignment);
       Value *V = propagateMetadata(LI, E->Scalars);
-      if (IsReversed) {
-        SmallVector<unsigned, 4> Mask(E->Scalars.size());
-        std::iota(Mask.rbegin(), Mask.rend(), 0);
-        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
+      if (IsReorder) {
+        SmallVector<unsigned, 4> Mask(E->Scalars.size());
+        for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I)
+          Mask[E->ReorderIndices[I]] = I;
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                        Mask, "reorder_shuffle");
       }
       if (NeedToShuffleReuses) {
+        // TODO: Merge this shuffle with the ReorderShuffleMask.
        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                         E->ReuseShuffleIndices, "shuffle");
       }
@@ -4815,8 +4939,9 @@
 
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
       R.buildTree(Ops);
+      Optional<ArrayRef<unsigned>> Order = R.bestOrder();
       // TODO: check if we can allow reordering for more cases.
-      if (AllowReorder && R.shouldReorder()) {
+      if (AllowReorder && Order) {
         // Conceptually, there is nothing actually preventing us from trying to
         // reorder a larger list. In fact, we do exactly this when vectorizing
         // reductions. However, at this point, we only expect to get here when
@@ -5562,9 +5687,13 @@
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
-      if (V.shouldReorder()) {
-        SmallVector<Value *, 4> Reversed(VL.rbegin(), VL.rend());
-        V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
+      Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+      if (Order) {
+        SmallVector<Value *, 4> ReorderedOps;
+        ReorderedOps.reserve(VL.size());
+        for (const unsigned Idx : *Order)
+          ReorderedOps.emplace_back(VL[Idx]);
+        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
Index: test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
+++ test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
@@ -10,15 +10,16 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 6
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 7
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 8
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]]
 ; CHECK-NEXT:    ret void
Index: test/Transforms/SLPVectorizer/X86/extract.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/extract.ll
+++ test/Transforms/SLPVectorizer/X86/extract.ll
@@ -30,14 +30,11 @@
 ; CHECK-LABEL: @fextr1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
-; CHECK-NEXT:    [[V0:%.*]] = extractelement <2 x double> [[LD]], i32 0
-; CHECK-NEXT:    [[V1:%.*]] = extractelement <2 x double> [[LD]], i32 1
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> , [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[P1]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> , [[REORDER_SHUFFLE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
+++ test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -11,21 +11,16 @@
 define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1) to <2 x i32>*), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32>
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[REORDER_SHUFFLE]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 8, i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
+++ test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
@@ -21,28 +21,21 @@
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -83,28 +76,21 @@
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP11]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
+++ test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
@@ -48,11 +48,11 @@
 ; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP34:%.*]], <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP34]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
@@ -103,23 +103,16 @@
 ; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX49]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX55]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX58]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i32> undef, i32 [[TMP28]], i32 0
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 1
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP25]], i32 2
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP27]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP34]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP33]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
+; CHECK-NEXT:    [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
Index: test/Transforms/SLPVectorizer/X86/jumbled-load.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -6,33 +6,26 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 %in.addr = getelementptr inbounds i32, i32* %in, i64 0
@@ -71,25 +64,27 @@
 define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load-multiuses(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_4]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_2]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_1]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[TMP10]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 %in.addr = getelementptr inbounds i32, i32* %in, i64 0
Index: test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
+++ test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
@@ -5,70 +5,49 @@
 ; CHECK-LABEL: @Foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32
-; CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 0
-; CHECK-NEXT:    [[VECEXT_I_I_1_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 1
-; CHECK-NEXT:    [[ADD_I_1_I:%.*]] = add i8 [[VECEXT_I_I_1_I]], [[VECEXT_I_I_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_2_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 2
-; CHECK-NEXT:    [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], [[VECEXT_I_I_2_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_3_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 3
-; CHECK-NEXT:    [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], [[VECEXT_I_I_3_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_4_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 4
-; CHECK-NEXT:    [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], [[VECEXT_I_I_4_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_5_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 5
-; CHECK-NEXT:    [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], [[VECEXT_I_I_5_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_6_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 6
-; CHECK-NEXT:    [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], [[VECEXT_I_I_6_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_7_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 7
-; CHECK-NEXT:    [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], [[VECEXT_I_I_7_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_8_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 8
-; CHECK-NEXT:    [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], [[VECEXT_I_I_8_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_9_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 9
-; CHECK-NEXT:    [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], [[VECEXT_I_I_9_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_10_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 10
-; CHECK-NEXT:    [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], [[VECEXT_I_I_10_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_11_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 11
-; CHECK-NEXT:    [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], [[VECEXT_I_I_11_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_12_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 12
-; CHECK-NEXT:    [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], [[VECEXT_I_I_12_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_13_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 13
-; CHECK-NEXT:    [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], [[VECEXT_I_I_13_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_14_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 14
-; CHECK-NEXT:    [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], [[VECEXT_I_I_14_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_15_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 15
-; CHECK-NEXT:    [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], [[VECEXT_I_I_15_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_16_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 16
-; CHECK-NEXT:    [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], [[VECEXT_I_I_16_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_17_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 17
-; CHECK-NEXT:    [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], [[VECEXT_I_I_17_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_18_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 18
-; CHECK-NEXT:    [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], [[VECEXT_I_I_18_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_19_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 19
-; CHECK-NEXT:    [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], [[VECEXT_I_I_19_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_20_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 20
-; CHECK-NEXT:    [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], [[VECEXT_I_I_20_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_21_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 21
-; CHECK-NEXT:    [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], [[VECEXT_I_I_21_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_22_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 22
-; CHECK-NEXT:    [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], [[VECEXT_I_I_22_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_23_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 23
-; CHECK-NEXT:    [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], [[VECEXT_I_I_23_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_24_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 24
-; CHECK-NEXT:    [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], [[VECEXT_I_I_24_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_25_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 25
-; CHECK-NEXT:    [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], [[VECEXT_I_I_25_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_26_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 26
-; CHECK-NEXT:    [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], [[VECEXT_I_I_26_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_27_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 27
-; CHECK-NEXT:    [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], [[VECEXT_I_I_27_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_28_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 28
-; CHECK-NEXT:    [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], [[VECEXT_I_I_28_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_29_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 29
-; CHECK-NEXT:    [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], [[VECEXT_I_I_29_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_30_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 30
-; CHECK-NEXT:    [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], [[VECEXT_I_I_30_I]]
-; CHECK-NEXT:    [[VECEXT_I_I_31_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 31
-; CHECK-NEXT:    [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], [[VECEXT_I_I_31_I]]
-; CHECK-NEXT:    ret i8 [[ADD_I_31_I]]
+; CHECK-NEXT:    [[ADD_I_1_I:%.*]] = add i8 undef, undef
+; CHECK-NEXT:    [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], undef
+; CHECK-NEXT:    [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], undef
+; CHECK-NEXT:    [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], undef
+; CHECK-NEXT:    [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], undef
+; CHECK-NEXT:    [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], undef
+; CHECK-NEXT:    [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], undef
+; CHECK-NEXT:    [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], undef
+; CHECK-NEXT:    [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], undef
+; CHECK-NEXT:    [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], undef
+; CHECK-NEXT:    [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], undef
+; CHECK-NEXT:    [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], undef
+; CHECK-NEXT:    [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], undef
+; CHECK-NEXT:    [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], undef
+; CHECK-NEXT:    [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], undef
+; CHECK-NEXT:    [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], undef
+; CHECK-NEXT:    [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], undef
+; CHECK-NEXT:    [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], undef
+; CHECK-NEXT:    [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], undef
+; CHECK-NEXT:    [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], undef
+; CHECK-NEXT:    [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], undef
+; CHECK-NEXT:    [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], undef
+; CHECK-NEXT:    [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], undef
+; CHECK-NEXT:    [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], undef
+; CHECK-NEXT:    [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], undef
+; CHECK-NEXT:    [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], undef
+; CHECK-NEXT:    [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], undef
+; CHECK-NEXT:    [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], undef
+; CHECK-NEXT:    [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], undef
+; CHECK-NEXT:    [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <32 x i8> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i8> [[BIN_RDX]], <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <32 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x i8> [[BIN_RDX2]], <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <32 x i8> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x i8> [[BIN_RDX4]], <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <32 x i8> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i8> [[BIN_RDX6]], <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <32 x i8> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <32 x i8> [[BIN_RDX8]], i32 0
+; CHECK-NEXT:    [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], undef
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
 entry:
   %0 = load <32 x i8>, <32 x i8>* %__v, align 32
Index: test/Transforms/SLPVectorizer/X86/store-jumbled.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -6,33 +6,26 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 %in.addr = getelementptr inbounds i32, i32* %in, i64 0
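For reference, a minimal standalone sketch (plain C++, mirroring the patch's inversePermutation helper; the numbers are derived from the @jumbled-load test above, not taken from the patch text) of how the reorder_shuffle masks in these tests are obtained from the sorted order:

#include <cassert>
#include <vector>

// Mirrors inversePermutation from the patch: Indices[K] holds the
// bundle (program-order) position of the K-th access in memory order;
// the returned Mask is the shufflevector mask that restores bundle order
// from a memory-ordered vector load.
static std::vector<unsigned>
inversePermutation(const std::vector<unsigned> &Indices) {
  std::vector<unsigned> Mask(Indices.size());
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
  return Mask;
}

int main() {
  // The load bundle in @jumbled-load consumes in[1], in[3], in[2], in[0]
  // in that order, so sorting by address visits bundle positions <3,0,2,1>;
  // the inverse is <1,3,2,0>, matching the first reorder_shuffle mask.
  assert((inversePermutation({3, 0, 2, 1}) ==
          std::vector<unsigned>{1, 3, 2, 0}));
  return 0;
}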