Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -942,6 +942,18 @@
       return nullptr;
     }

+    ScheduleData *getScheduleData(Value *V, Value *Key) {
+      if (V == Key)
+        return getScheduleData(V);
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end()) {
+        ScheduleData *SD = I->second[Key];
+        if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+          return SD;
+      }
+      return nullptr;
+    }
+
     bool isInSchedulingRegion(ScheduleData *SD) {
       return SD->SchedulingRegionID == SchedulingRegionID;
     }
@@ -955,19 +967,29 @@
       ScheduleData *BundleMember = SD;
       while (BundleMember) {
+        if (BundleMember->Inst != BundleMember->OpValue) {
+          BundleMember = BundleMember->NextInBundle;
+          continue;
+        }
         // Handle the def-use chain dependencies.
         for (Use &U : BundleMember->Inst->operands()) {
-          ScheduleData *OpDef = getScheduleData(U.get());
-          if (OpDef && OpDef->hasValidDependencies() &&
-              OpDef->incrementUnscheduledDeps(-1) == 0) {
-            // There are no more unscheduled dependencies after decrementing,
-            // so we can put the dependent instruction into the ready list.
-            ScheduleData *DepBundle = OpDef->FirstInBundle;
-            assert(!DepBundle->IsScheduled &&
-                   "already scheduled bundle gets ready");
-            ReadyList.insert(DepBundle);
-            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
-          }
+          auto *I = dyn_cast<Instruction>(U.get());
+          if (!I)
+            continue;
+          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+            if (OpDef && OpDef->hasValidDependencies() &&
+                OpDef->incrementUnscheduledDeps(-1) == 0) {
+              // There are no more unscheduled dependencies after
+              // decrementing, so we can put the dependent instruction
+              // into the ready list.
+              ScheduleData *DepBundle = OpDef->FirstInBundle;
+              assert(!DepBundle->IsScheduled &&
+                     "already scheduled bundle gets ready");
+              ReadyList.insert(DepBundle);
+              DEBUG(dbgs()
+                    << "SLP: gets ready (def): " << *DepBundle << "\n");
+            }
+          });
         }
         // Handle the memory dependencies.
         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
@@ -978,22 +1000,35 @@
             assert(!DepBundle->IsScheduled &&
                    "already scheduled bundle gets ready");
             ReadyList.insert(DepBundle);
-            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
+            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
+                         << "\n");
           }
         }
         BundleMember = BundleMember->NextInBundle;
       }
     }

+    void doForAllOpcodes(Value *V,
+                         function_ref<void(ScheduleData *SD)> Action) {
+      if (ScheduleData *SD = getScheduleData(V))
+        Action(SD);
+      auto I = ExtraScheduleDataMap.find(V);
+      if (I != ExtraScheduleDataMap.end())
+        for (auto &P : I->second)
+          if (P.second->SchedulingRegionID == SchedulingRegionID)
+            Action(P.second);
+    }
+
     /// Put all instructions into the ReadyList which are ready for scheduling.
     template <typename ReadyListType>
     void initialFillReadyList(ReadyListType &ReadyList) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-        ScheduleData *SD = getScheduleData(I);
-        if (SD->isSchedulingEntity() && SD->isReady()) {
-          ReadyList.insert(SD);
-          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
-        }
+        doForAllOpcodes(I, [&ReadyList, I](ScheduleData *SD) {
+          if (SD->isSchedulingEntity() && SD->isReady()) {
+            ReadyList.insert(SD);
+            DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
+          }
+        });
       }
     }

@@ -1005,9 +1040,12 @@
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

+    /// Allocates schedule data chunk.
+    ScheduleData *allocateScheduleDataChunks();
+
     /// Extends the scheduling region so that V is inside the region.
     /// \returns true if the region size is within the limit.
-    bool extendSchedulingRegion(Value *V);
+    bool extendSchedulingRegion(Value *V, Value *OpValue);

     /// Initialize the ScheduleData structures for new instructions in the
     /// scheduling region.
@@ -1040,6 +1078,10 @@
     /// ScheduleData structures are recycled.
     DenseMap<Value *, ScheduleData *> ScheduleDataMap;

+    /// Attaches ScheduleData to Instruction with the leading key.
+    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+        ExtraScheduleDataMap;
+
     struct ReadyList : SmallVector<ScheduleData *, 8> {
       void insert(ScheduleData *SD) { push_back(SD); }
     };
@@ -3279,7 +3321,7 @@
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (!extendSchedulingRegion(V))
+    if (!extendSchedulingRegion(V, OpValue))
       return false;
   }

@@ -3316,8 +3358,9 @@
     // It is seldom that this needs to be done a second time after adding the
     // initial bundle to the region.
     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-      ScheduleData *SD = getScheduleData(I);
-      SD->clearDependencies();
+      doForAllOpcodes(I, [](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
     }
     ReSchedule = true;
   }
@@ -3378,17 +3421,43 @@
   }
 }

-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
-  if (getScheduleData(V))
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+  // Allocate a new ScheduleData for the instruction.
+  if (ChunkPos >= ChunkSize) {
+    ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
+    ChunkPos = 0;
+  }
+  return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+                                                      Value *OpValue) {
+  if (getScheduleData(V, isOneOf(OpValue, V)))
     return true;
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+  auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool {
+    ScheduleData *ISD = getScheduleData(I);
+    if (!ISD)
+      return false;
+    assert(isInSchedulingRegion(ISD) &&
+           "ScheduleData not in scheduling region");
+    ScheduleData *SD = allocateScheduleDataChunks();
+    SD->Inst = I;
+    SD->init(SchedulingRegionID, OpValue);
+    ExtraScheduleDataMap[I][OpValue] = SD;
+    return true;
+  };
+  if (CheckSheduleForI(I))
+    return true;
   if (!ScheduleStart) {
     // It's the first instruction in the new region.
     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
     ScheduleStart = I;
     ScheduleEnd = I->getNextNode();
+    if (isOneOf(OpValue, I) != I)
+      CheckSheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
     DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
     return true;
@@ -3410,6 +3479,8 @@
       if (&*UpIter == I) {
         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
         ScheduleStart = I;
+        if (isOneOf(OpValue, I) != I)
+          CheckSheduleForI(I);
         DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
         return true;
       }
@@ -3420,6 +3491,8 @@
         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                          nullptr);
         ScheduleEnd = I->getNextNode();
+        if (isOneOf(OpValue, I) != I)
+          CheckSheduleForI(I);
         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
         DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
         return true;
@@ -3446,7 +3519,7 @@
             llvm::make_unique<ScheduleData[]>(ChunkSize));
         ChunkPos = 0;
       }
-      SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+      SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
       SD->Inst = I;
     }
@@ -3494,23 +3567,35 @@
         BundleMember->resetUnscheduledDeps();

         // Handle def-use chain dependencies.
-        for (User *U : BundleMember->Inst->users()) {
-          if (isa<Instruction>(U)) {
-            ScheduleData *UseSD = getScheduleData(U);
-            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+        if (BundleMember->OpValue != BundleMember->Inst) {
+          ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+            BundleMember->Dependencies++;
+            ScheduleData *DestBundle = UseSD->FirstInBundle;
+            if (!DestBundle->IsScheduled)
+              BundleMember->incrementUnscheduledDeps(1);
+            if (!DestBundle->hasValidDependencies())
+              WorkList.push_back(DestBundle);
+          }
+        } else {
+          for (User *U : BundleMember->Inst->users()) {
+            if (isa<Instruction>(U)) {
+              ScheduleData *UseSD = getScheduleData(U);
+              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+                BundleMember->Dependencies++;
+                ScheduleData *DestBundle = UseSD->FirstInBundle;
+                if (!DestBundle->IsScheduled)
+                  BundleMember->incrementUnscheduledDeps(1);
+                if (!DestBundle->hasValidDependencies())
+                  WorkList.push_back(DestBundle);
+              }
+            } else {
+              // I'm not sure if this can ever happen. But we need to be safe.
+              // This lets the instruction/bundle never be scheduled and
+              // eventually disable vectorization.
               BundleMember->Dependencies++;
-              ScheduleData *DestBundle = UseSD->FirstInBundle;
-              if (!DestBundle->IsScheduled)
-                BundleMember->incrementUnscheduledDeps(1);
-              if (!DestBundle->hasValidDependencies())
-                WorkList.push_back(DestBundle);
+              BundleMember->incrementUnscheduledDeps(1);
             }
-          } else {
-            // I'm not sure if this can ever happen. But we need to be safe.
-            // This lets the instruction/bundle never be scheduled and
-            // eventually disable vectorization.
-            BundleMember->Dependencies++;
-            BundleMember->incrementUnscheduledDeps(1);
           }
         }

@@ -3587,10 +3672,12 @@
   assert(ScheduleStart &&
          "tried to reset schedule on block which has not been scheduled");
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
-    ScheduleData *SD = getScheduleData(I);
-    assert(isInSchedulingRegion(SD));
-    SD->IsScheduled = false;
-    SD->resetUnscheduledDeps();
+    doForAllOpcodes(I, [this](ScheduleData *SD) {
+      assert(isInSchedulingRegion(SD) &&
+             "ScheduleData not in scheduling region");
+      SD->IsScheduled = false;
+      SD->resetUnscheduledDeps();
+    });
   }
   ReadyInsts.clear();
 }
@@ -3620,15 +3707,16 @@
   int NumToSchedule = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    ScheduleData *SD = BS->getScheduleData(I);
-    assert(
-        SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
-        "scheduler and vectorizer have different opinion on what is a bundle");
-    SD->FirstInBundle->SchedulingPriority = Idx++;
-    if (SD->isSchedulingEntity()) {
-      BS->calculateDependencies(SD, false, this);
-      NumToSchedule++;
-    }
+    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+      assert(SD->isPartOfBundle() ==
+                 (getTreeEntry(SD->Inst) != nullptr) &&
+             "scheduler and vectorizer bundle mismatch");
+      SD->FirstInBundle->SchedulingPriority = Idx++;
+      if (SD->isSchedulingEntity()) {
+        BS->calculateDependencies(SD, false, this);
+        NumToSchedule++;
+      }
+    });
   }
   BS->initialFillReadyList(ReadyInsts);
Index: test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+@b = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+
+define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
+; CHECK-LABEL: @slp_schedule_bundle(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP1]]
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
+; CHECK-NEXT:    [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+; CHECK-NEXT:    [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
+; CHECK-NEXT:    [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 0, i64 0), align 4
+  %.lobit = lshr i32 %0, 31
+  %.lobit.not = xor i32 %.lobit, 1
+  store i32 %.lobit.not, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 0, i64 0), align 4
+  %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 1, i64 0), align 4
+  %.lobit.1 = lshr i32 %1, 31
+  %.lobit.not.1 = xor i32 %.lobit.1, 1
+  store i32 %.lobit.not.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 1, i64 0), align 4
+  %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 2, i64 0), align 4
+  %.lobit.2 = lshr i32 %2, 31
+  %.lobit.not.2 = xor i32 %.lobit.2, 1
+  store i32 %.lobit.not.2, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 2, i64 0), align 4
+  %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 3, i64 0), align 4
+  %.lobit.3 = lshr i32 %3, 31
+  %.lobit.not.3 = xor i32 %.lobit.3, 1
+  store i32 %.lobit.not.3, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 3, i64 0), align 4
+  %4 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+  %.lobit.4 = lshr i32 %4, 31
+  %.lobit.not.4 = xor i32 %.lobit.4, 1
+  store i32 %.lobit.not.4, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+  %5 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+  %.lobit.5 = lshr i32 %5, 31
+  %.lobit.not.5 = xor i32 %.lobit.5, 1
+  store i32 %.lobit.not.5, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+  ret i32 undef
+}