Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -585,7 +585,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
-    NumOpsWantToKeepOrder.clear();
+    NumOpsWantToKeepOrder = 0;
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -599,14 +599,7 @@
   void optimizeGatherSequence();
 
   /// \returns true if it is beneficial to reverse the vector order.
-  bool shouldReorder() const {
-    return std::accumulate(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
-        [](int Val1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
-          return Val1 + (Val2.second < 0 ? 1 : -1);
-        }) > 0;
-  }
+  bool shouldReorder() const { return NumOpsWantToKeepOrder < 0; }
 
   /// \return The vector element size in bits to use when vectorizing the
   /// expression tree ending at \p V. If V is a store, the size is the width of
@@ -1205,7 +1198,7 @@
   /// Number of operation bundles that contain consecutive operations - number
   /// of operation bundles that contain consecutive operations in reversed
   /// order.
-  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
+  int NumOpsWantToKeepOrder;
 
   // Analysis and block reference.
   Function *F;
@@ -1560,11 +1553,11 @@
       bool Reuse = canReuseExtract(VL, VL0);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOrder[S.Opcode];
+        ++NumOpsWantToKeepOrder;
       } else {
         SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
         if (canReuseExtract(ReverseVL, VL0))
-          --NumOpsWantToKeepOrder[S.Opcode];
+          --NumOpsWantToKeepOrder;
         BS.cancelScheduling(VL, VL0);
       }
       newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
@@ -1614,7 +1607,7 @@
       }
 
       if (Consecutive) {
-        ++NumOpsWantToKeepOrder[S.Opcode];
+        ++NumOpsWantToKeepOrder;
         newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
@@ -1630,7 +1623,7 @@
      }
 
      if (ReverseConsecutive) {
-        --NumOpsWantToKeepOrder[S.Opcode];
+        --NumOpsWantToKeepOrder;
        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
        return;
@@ -1797,22 +1790,54 @@
     }
     case Instruction::Store: {
       // Check if the stores are consecutive or of we need to swizzle them.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
-          BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
-          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
-          return;
+      bool IsConsecutive = true;
+      bool IsReverseConsecutive = true;
+      for (unsigned I = 0, E = VL.size() - 1; I < E; ++I) {
+        if (isConsecutiveAccess(VL[I], VL[I + 1], *DL, *SE))
+          IsReverseConsecutive = false;
+        else {
+          IsConsecutive = false;
+          break;
         }
+      }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
-      DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+      if (IsConsecutive) {
+        ++NumOpsWantToKeepOrder;
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+        DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+        ValueList Operands;
+        for (Value *V : VL)
+          Operands.push_back(cast<StoreInst>(V)->getOperand(0));
+
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        return;
+      }
+
+      if (IsReverseConsecutive) {
+        for (unsigned I = VL.size(); I > 1; --I) {
+          if (!isConsecutiveAccess(VL[I - 1], VL[I - 2], *DL, *SE)) {
+            IsReverseConsecutive = false;
+            break;
+          }
+        }
+        if (IsReverseConsecutive) {
+          // Reversed consecutive order.
+          --NumOpsWantToKeepOrder;
+          newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+          DEBUG(dbgs() << "SLP: added a vector of reversed stores.\n");
+
+          ValueList Operands;
+          for (Value *V : VL)
+            Operands.push_back(cast<StoreInst>(V)->getOperand(0));
 
-      ValueList Operands;
-      for (Value *j : VL)
-        Operands.push_back(cast<StoreInst>(j)->getOperand(0));
+          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+          return;
+        }
+      }
 
-      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+      DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
       return;
     }
     case Instruction::Call: {
@@ -2254,7 +2279,8 @@
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
-      unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
+      auto *SI = cast<StoreInst>(VL0);
+      unsigned alignment = SI->getAlignment();
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
                             TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
@@ -2264,6 +2290,10 @@
           TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
       int VecStCost =
           TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0);
+      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
+        VecStCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
       return ReuseShuffleCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
@@ -3245,7 +3275,9 @@
       return V;
     }
     case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(VL0);
+      bool IsConsecutive =
+          isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
+      StoreInst *SI = cast<StoreInst>(IsConsecutive ? VL0 : E->Scalars.back());
       unsigned Alignment = SI->getAlignment();
       unsigned AS = SI->getPointerAddressSpace();
 
@@ -3256,6 +3288,12 @@
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *VecValue = vectorizeTree(ScalarStoreValues);
+      if (!IsConsecutive) {
+        SmallVector<unsigned, 4> Mask(E->Scalars.size());
+        std::iota(Mask.rbegin(), Mask.rend(), 0);
+        VecValue = Builder.CreateShuffleVector(
+            VecValue, UndefValue::get(VecValue->getType()), Mask);
+      }
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
@@ -4570,6 +4608,10 @@
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
 
     R.buildTree(Operands);
+    if (R.shouldReorder()) {
+      SmallVector<Value *, 4> Reversed(Operands.rbegin(), Operands.rend());
+      R.buildTree(Reversed);
+    }
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
 
Index: test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
+++ test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -92,15 +92,14 @@
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
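
As a rough illustration of the kind of source this change targets, consider the hypothetical C kernel below (it is not the stores_vectorize.ll function above; the names and indices are made up for the sketch). The loads walk a[] and b[] forwards while the stores walk dst[] backwards, so once the store chain is taken in address order the load bundles are consecutive only in reversed order. Previously the vectorizer kept the chain in address order and paid a reverse shuffle on each vectorized load; with this patch shouldReorder() lets vectorizeStoreChain rebuild the tree on the reversed operand list, buildTree_rec accepts the now reverse-consecutive store bundle, and a single shufflevector is emitted on the stored value instead, as in the updated CHECK lines.

    #include <stdint.h>

    /* Hypothetical example: the four stores hit dst[3..0] (reverse address
     * order) while the loads read a[0..3] and b[0..3], mirroring the
     * shl-of-loads pattern in the regression test. */
    void reversed_store_chain(int64_t *dst, const int64_t *a,
                              const int64_t *b) {
      dst[3] = a[0] << b[0];
      dst[2] = a[1] << b[1];
      dst[1] = a[2] << b[2];
      dst[0] = a[3] << b[3];
    }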