Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1113,6 +1113,10 @@
     /// Sets all instruction in the scheduling region to un-scheduled.
     void resetSchedule();
 
+    /// Reorder ScheduleData after scheduling, so that the first instruction
+    /// in the list is the last one scheduled.
+    void reorderBundles();
+
     BasicBlock *BB;
 
     /// Simple memory allocation for ScheduleData.
@@ -2869,17 +2873,15 @@
   // The last instruction in the bundle in program order.
   Instruction *LastInst = nullptr;
 
-  // Find the last instruction. The common case should be that BB has been
-  // scheduled, and the last instruction is VL.back(). So we start with
-  // VL.back() and iterate over schedule data until we reach the end of the
-  // bundle. The end of the bundle is marked by null ScheduleData.
+  // Find the last instruction. If the bundle is not scheduled, then the
+  // first instruction in the bundle is the last one in BB, because we
+  // discover bundles in a backward walk. If the bundle is already scheduled,
+  // it has been reordered so that its first element is the last one scheduled.
   if (BlocksSchedules.count(BB)) {
     auto *Bundle =
         BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back()));
     if (Bundle && Bundle->isPartOfBundle())
-      for (; Bundle; Bundle = Bundle->NextInBundle)
-        if (Bundle->OpValue == Bundle->Inst)
-          LastInst = Bundle->Inst;
+      LastInst = Bundle->FirstInBundle->Inst;
   }
 
   // LastInst can still be null at this point if there's either not an entry
@@ -4190,6 +4192,47 @@
   ReadyInsts.clear();
 }
 
+void BoUpSLP::BlockScheduling::reorderBundles() {
+  DenseMap<ScheduleData *, ScheduleData *> ReorderMap;
+  // Walk backward in the scheduling region to discover the last instruction
+  // of each bundle.
+  for (auto *I = ScheduleEnd; I != ScheduleStart; I = I->getPrevNode()) {
+    doForAllOpcodes(I, [&ReorderMap, &I](ScheduleData *SD) {
+      if (SD->isPartOfBundle() &&
+          ReorderMap.find(SD->FirstInBundle) == ReorderMap.end()) {
+        ReorderMap[SD->FirstInBundle] = SD;
+      }
+    });
+  }
+  DenseMap<ScheduleData *, ScheduleData *>::iterator it;
+  // Swap the last scheduled instruction with the first one in the bundle.
+  for (it = ReorderMap.begin(); it != ReorderMap.end(); it++) {
+    ScheduleData *FirstSD = it->first;
+    ScheduleData *LastSD = it->second;
+    SmallVector<ScheduleData *, 4> Bundle;
+    unsigned LastPos = 0;
+    // The first instruction in the bundle is already the last one scheduled.
+    if (FirstSD == LastSD)
+      continue;
+    ScheduleData *SD = FirstSD;
+    while (SD) {
+      if (SD == LastSD)
+        LastPos = Bundle.size();
+      Bundle.push_back(SD);
+      SD = SD->NextInBundle;
+    }
+    std::swap(Bundle[0], Bundle[LastPos]);
+    for (ScheduleData *SD : Bundle)
+      SD->FirstInBundle = Bundle[0];
+    Bundle[0]->NextInBundle = Bundle[1];
+    Bundle[LastPos - 1]->NextInBundle = Bundle[LastPos];
+    if (LastPos == Bundle.size() - 1)
+      Bundle[LastPos]->NextInBundle = nullptr;
+    else
+      Bundle[LastPos]->NextInBundle = Bundle[LastPos + 1];
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4252,6 +4295,7 @@
     NumToSchedule--;
   }
   assert(NumToSchedule == 0 && "could not schedule all instructions");
+  BS->reorderBundles();
 
   // Avoid duplicate scheduling of the block.
   BS->ScheduleStart = nullptr;
Index: test/Transforms/SLPVectorizer/X86/insert-after-bundle1.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/insert-after-bundle1.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; For this bundle [ store i32 %sub.i251.5 ...;
+;                   store i32 %sub.i251.4 ...;
+;                   store i32 %sub.i251.3 ...;
+;                   store i32 %sub.i251.2 ...]
+; the last scheduled instruction is expected to be store i32 %sub.i251.2...,
+; but after scheduling it is store i32 %sub.i251.5...
+
+define dso_local void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX_I252_2:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 13
+; CHECK-NEXT:    [[ARRAYIDX_I252_3:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 12
+; CHECK-NEXT:    [[ARRAYIDX_I252_4:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 11
+; CHECK-NEXT:    [[ARRAYIDX_I252_5:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 10
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX_I252_5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> undef, <4 x i32>* [[TMP0]], align 8
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %sub.i251.2 = sub nsw i32 0, undef
+  %arrayidx.i252.2 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 13
+  store i32 %sub.i251.2, i32* %arrayidx.i252.2, align 4
+  %sub.i251.3 = sub nsw i32 undef, undef
+  %arrayidx.i252.3 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 12
+  store i32 %sub.i251.3, i32* %arrayidx.i252.3, align 16
+  %sub.i251.4 = sub nsw i32 undef, undef
+  %arrayidx.i252.4 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 11
+  store i32 %sub.i251.4, i32* %arrayidx.i252.4, align 4
+  %sub.i251.5 = sub nsw i32 undef, undef
+  %arrayidx.i252.5 = getelementptr inbounds [32 x i32], [32 x i32]* undef, i64 0, i64 10
+  store i32 %sub.i251.5, i32* %arrayidx.i252.5, align 8
+  unreachable
+}
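
For reference, below is a minimal standalone C++ sketch of the list surgery
reorderBundles() performs; the names Node and moveLastScheduledToFront are
hypothetical stand-ins, not the LLVM API. The bundle member that was scheduled
last is moved to the head of the FirstInBundle/NextInBundle list, so that
getLastInstruction reduces to a single FirstInBundle->Inst lookup. For
simplicity the sketch relinks every member in the new order; the patch above
rewrites only the three links the swap disturbs, which yields the same list.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Stand-in for ScheduleData: each bundle member carries FirstInBundle /
// NextInBundle links, the two fields reorderBundles() rewires.
struct Node {
  int Id = 0;                    // Stand-in for the underlying Instruction.
  Node *FirstInBundle = nullptr; // Head of the bundle list.
  Node *NextInBundle = nullptr;  // Next bundle member, or null at the end.
};

// Move LastScheduled to the head of the bundle that starts at First and
// return the new head. Mirrors the swap in reorderBundles().
static Node *moveLastScheduledToFront(Node *First, Node *LastScheduled) {
  // The first member is already the last one scheduled; nothing to do.
  if (First == LastScheduled)
    return First;

  // Flatten the bundle into a vector, remembering where LastScheduled sits.
  std::vector<Node *> Bundle;
  std::size_t LastPos = 0;
  for (Node *N = First; N; N = N->NextInBundle) {
    if (N == LastScheduled)
      LastPos = Bundle.size();
    Bundle.push_back(N);
  }

  // Swap positions 0 and LastPos, then relink the list in the new order.
  std::swap(Bundle[0], Bundle[LastPos]);
  for (std::size_t I = 0; I < Bundle.size(); ++I) {
    Bundle[I]->FirstInBundle = Bundle[0];
    Bundle[I]->NextInBundle =
        (I + 1 < Bundle.size()) ? Bundle[I + 1] : nullptr;
  }
  return Bundle[0];
}

int main() {
  // A bundle of four members, N0 -> N1 -> N2 -> N3, where N3 (cf. the store
  // of %sub.i251.2 in the test) happens to be the last one scheduled.
  Node N0{0}, N1{1}, N2{2}, N3{3};
  N0.NextInBundle = &N1;
  N1.NextInBundle = &N2;
  N2.NextInBundle = &N3;
  for (Node *N : {&N0, &N1, &N2, &N3})
    N->FirstInBundle = &N0;

  // After reordering, N3 heads the list: N3 -> N1 -> N2 -> N0.
  Node *Head = moveLastScheduledToFront(&N0, &N3);
  assert(Head == &N3 && Head->NextInBundle == &N1);
  assert(N1.NextInBundle == &N2 && N2.NextInBundle == &N0);
  assert(N0.NextInBundle == nullptr);
  return 0;
}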