Index: lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- lib/Target/PowerPC/PPCISelLowering.h +++ lib/Target/PowerPC/PPCISelLowering.h @@ -17,6 +17,7 @@ #include "PPC.h" #include "PPCInstrInfo.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -540,6 +541,32 @@ class PPCTargetLowering : public TargetLowering { const PPCSubtarget &Subtarget; + struct JTCallBack final : public CallbackVH { + JTCallBack(const Value *V, const PPCTargetLowering *TL = nullptr) + : CallbackVH(const_cast(V)), PPCTLI(TL) { } + // If we RAUW, we don't know anything about the new value, so just + // erase it from all the sets. + void allUsesReplacedWith(Value *V) override { + deleted(); + } + void deleted() override { + if (PPCTLI) { + PPCTLI->KnownNoJTSwitches.erase(*this); + PPCTLI->BlocksKnownToUseCTR.erase(*this); + PPCTLI->BlocksKnownNotToUseCTR.erase(*this); + } + } + private: + const PPCTargetLowering *PPCTLI; + }; + friend struct JTCallBack; + + mutable SmallSet KnownNoJTSwitches; + mutable SmallSet BlocksKnownToUseCTR; + mutable SmallSet BlocksKnownNotToUseCTR; + // The threshold for the number of cases a switch should have for jump + // tables to be profitable regardless of whether any cases have calls. + static unsigned NumCasesForJTWithCalls; public: explicit PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI); @@ -565,6 +592,31 @@ bool useSoftFloat() const override; + // A jump table is emitted as lwax -> mtctr -> bctr. All three of those + // are high latency instructions, so we don't want to emit jump tables for + // switches with fewer than 5 cases as the jump table overhead will outweigh + // the benefit. Please note that a jump table will only be emitted for a + // switch with so few cases if there are no calls or loop headers in any + // of the case target blocks. + // The lwax -> mtctr sequence takes 11 cycles whereas each + // cmpli takes 2 cycles and can dispatch 4-wide. Assuming the + // worst case - last branch is taken and linear comparisons, we can + // (theoretically) complete 6 cmpli's in 3 cycles and then the 6 branches + // in 8 cycles with a single cycle overlap for a total of 10 cycles. + // However, in practice (as confirmed by benchmarking on a Power9 machine) + // a jump table can be the better choice for switches as small as 5 cases + // as long as there are no calls in the target blocks. + unsigned getMinimumJumpTableEntries() const override { + return 5; + } + /// Does the basic block use the count register (CTR)? This will happen if + /// the block has a call or is terminated by a switch that might be lowered + /// to a jump table. + bool mightUseCTR(const BasicBlock *BB) const; + + bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, + uint64_t Range) const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -113,12 +113,22 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); +STATISTIC(NumCallsToIsSuitableForJT, + "Number of times a switch is checked for JT eligibility"); +STATISTIC(NumSkippedAlreadyExaminedSwitches, + "Number of switches previously proven not eligible for JT"); +STATISTIC(NumSkippedAlreadyVisitedCases, + "Number of cases not examined for CTR use due to previous visits"); +STATISTIC(NumSwitchesNotSuitableForJT, + "Number of switch instructions not suitable for jump tables"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; +unsigned PPCTargetLowering::NumCasesForJTWithCalls = 64; + PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1130,6 +1140,89 @@ return Subtarget.useSoftFloat(); } +bool PPCTargetLowering::mightUseCTR(const BasicBlock *BB) const { + // Skip blocks known to use the CTR. + if (BlocksKnownToUseCTR.count(BB)) { + NumSkippedAlreadyVisitedCases++; + return true; + } + if (BlocksKnownNotToUseCTR.count(BB)) { + NumSkippedAlreadyVisitedCases++; + return false; + } + for (auto &Inst : *BB) { + // Benchmarking shows that there really isn't any difference between + // direct and indirect calls when it comes to jump table profitability. + if (isa(&Inst)) { + BlocksKnownToUseCTR.insert(JTCallBack(BB, this)); + return true; + } else if (const SwitchInst *SI = dyn_cast(&Inst)) { + // Does the block have a switch instruction large enough to actually + // be lowered to a jump table? If so, conservatively assume that one + // will be lowered to a jump table. + if ((SI->getNumCases() >= getMinimumJumpTableEntries())) { + BlocksKnownToUseCTR.insert(JTCallBack(BB, this)); + return true; + } + } + } + BlocksKnownNotToUseCTR.insert(JTCallBack(BB, this)); + return false; +} + +// This function is called a huge number of times so we try to limit the number +// of traversals of all basic blocks. Just as an example, a bootstrap build +// calls this function 11,822,805 times on 7554 unique SwitchInst instances. We +// conservatively re-check any SwitchInst's that were previously suitable for a +// jump table to ensure that some transformation hasn't made it unsuitable. +bool PPCTargetLowering::isSuitableForJumpTable(const SwitchInst *SI, + uint64_t NumCases, + uint64_t Range) const { + NumCallsToIsSuitableForJT++; + if (!TargetLoweringBase::isSuitableForJumpTable(SI, NumCases, Range)) + return false; + + // Jump tables are fine with very large switch statements. + bool LargeSwitch = SI->getNumCases() >= NumCasesForJTWithCalls; + if (LargeSwitch) { + BlocksKnownToUseCTR.insert(JTCallBack(SI->getParent(), this)); + BlocksKnownNotToUseCTR.erase(SI->getParent()); + return true; + } + + // If we already know that we don't want to lower this SwitchInst to + // a jump table, no need to check further. + if (KnownNoJTSwitches.count(SI)) { + NumSkippedAlreadyExaminedSwitches++; + return false; + } + + // At this point we know we're not looking at a huge switch, so a CTR + // use in any of the cases makes this SI not suitable for a jump table. + // FIXME: a switch shouldn't use a jump table only if a case that is + // actually visited has a use of the CTR. Perhaps branch probability + // should be considered here in the future (i.e. if the branch probability + // is way less than 1 / NumCases). + for (auto It : SI->cases()) { + if (mightUseCTR(It.getCaseSuccessor())) { + NumSwitchesNotSuitableForJT++; + KnownNoJTSwitches.insert(JTCallBack(SI, this)); + DEBUG(dbgs() << "In function: " << + SI->getParent()->getParent()->getName()); + DEBUG(dbgs() << "\nSwitch instruction:\n"); + DEBUG(SI->dump()); + DEBUG(dbgs() << "Not suitable for a jump table due to possible CTR " + "use in successor:\n"); + DEBUG(It.getCaseSuccessor()->dump()); + return false; + } + } + // The containing block now uses the CTR. + BlocksKnownToUseCTR.insert(JTCallBack(SI->getParent(), this)); + BlocksKnownNotToUseCTR.erase(SI->getParent()); + return true; +} + const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; Index: test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll =================================================================== --- test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll +++ test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll @@ -0,0 +1,258 @@ +; RUN: llc -verify-machineinstrs -debug-only=ppc-lowering < %s > /dev/null 2>%t && FileCheck <%t %s +; REQUIRES: asserts +declare void @llvm.ppc.mtctr.i64(i64) + +define fastcc void @case2() { +; CHECK: In function: case2 +; CHECK: Switch instruction: +; CHECK: switch i32 %0, label %sw.default [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHEC: sw.bb20: +entry: + switch i16 undef, label %sw.default [ + i16 54, label %sw.bb + i16 55, label %sw.bb3 + i16 57, label %sw.bb14 + i16 68, label %sw.bb20 + i16 67, label %sw.bb27 + i16 58, label %sw.bb36 + i16 134, label %sw.bb45 + i16 61, label %sw.bb54 + i16 63, label %sw.bb82 + i16 62, label %while.body.i486 + i16 69, label %while.body.i499 + i16 59, label %while.body.i512 + i16 66, label %sw.bb102 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb3: ; preds = %entry + unreachable + +sw.bb14: ; preds = %entry + unreachable + +sw.bb20: ; preds = %entry + br label %land.rhs.i258 + +land.rhs.i258: ; preds = %while.body.i262, %sw.bb20 + %cur.addr.019.i255.idx = phi i64 [ 1, %sw.bb20 ], [ %cur.addr.019.i255.add, %while.body.i262 ] + br label %while.body.i262 + +while.body.i262: ; preds = %land.rhs.i258 + %cur.addr.019.i255.add = add nuw nsw i64 %cur.addr.019.i255.idx, 1 + %cmp2.i261 = icmp ult i64 %cur.addr.019.i255.idx, 2045 + br i1 %cmp2.i261, label %land.rhs.i258, label %safe_concat.exit264 + +safe_concat.exit264: ; preds = %while.body.i262 + unreachable + +sw.bb27: ; preds = %entry + unreachable + +sw.bb36: ; preds = %entry + unreachable + +sw.bb45: ; preds = %entry + unreachable + +sw.bb54: ; preds = %entry + unreachable + +sw.bb82: ; preds = %entry + unreachable + +while.body.i486: ; preds = %entry + unreachable + +while.body.i499: ; preds = %entry + unreachable + +while.body.i512: ; preds = %entry + ret void + +sw.bb102: ; preds = %entry + unreachable + +sw.default: ; preds = %entry + unreachable +} + +define void @case4() { +; CHECK: In function: case4 +; CHECK: Switch instruction: +; CHECK: switch i32 undef, label %sw.default660 [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHECK: sw.bb466: +; CHECK: tail call void undef(i8* undef) +entry: + switch i32 undef, label %sw.default660 [ + i32 0, label %sw.bb + i32 20, label %sw.bb458 + i32 19, label %sw.bb444 + i32 15, label %sw.bb361 + i32 29, label %sw.bb44 + i32 13, label %sw.bb53 + i32 17, label %sw.bb377 + i32 16, label %sw.bb372 + i32 25, label %sw.bb535 + i32 24, label %sw.bb484 + i32 36, label %sw.bb366 + i32 14, label %sw.bb315 + i32 12, label %sw.bb310 + i32 30, label %sw.bb466 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb44: ; preds = %entry + unreachable + +sw.bb53: ; preds = %entry + unreachable + +sw.bb310: ; preds = %entry + unreachable + +sw.bb315: ; preds = %entry + unreachable + +sw.bb361: ; preds = %entry + unreachable + +sw.bb366: ; preds = %entry + unreachable + +sw.bb372: ; preds = %entry + unreachable + +sw.bb377: ; preds = %entry + unreachable + +sw.bb444: ; preds = %entry + unreachable + +sw.bb458: ; preds = %entry + unreachable + +sw.bb466: ; preds = %entry + tail call void undef(i8* undef) #1 + unreachable + +sw.bb484: ; preds = %entry + unreachable + +sw.bb535: ; preds = %entry + unreachable + +sw.default660: ; preds = %entry + unreachable +} + +define void @case5(i1 %in) { +; CHECK: In function: case5 +; CHECK: Switch instruction: +; CHECK: switch i32 %0, label %sw.default [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHECK: sw.bb189: ; preds = %if.end, %if.end, %if.end, %if.end +; CHECK: switch i32 %1, label %sw.default.i [ +entry: + br i1 %in, label %cleanup, label %if.end + +if.end: ; preds = %entry + switch i8 undef, label %sw.default [ + i8 0, label %undef_sstr + i8 1, label %sw.bb31 + i8 2, label %sw.bb80 + i8 3, label %sw.bb124 + i8 4, label %sw.bb171 + i8 14, label %sw.bb171 + i8 5, label %sw.bb177 + i8 6, label %sw.bb183 + i8 10, label %sw.bb189 + i8 11, label %sw.bb189 + i8 12, label %sw.bb189 + i8 15, label %sw.bb189 + i8 13, label %sw.bb201 + ] + +undef_sstr: ; preds = %sw.bb31, %if.end + unreachable + +sw.bb31: ; preds = %if.end + br label %undef_sstr + +sw.bb80: ; preds = %if.end + unreachable + +sw.bb124: ; preds = %if.end + unreachable + +sw.bb171: ; preds = %if.end, %if.end + unreachable + +sw.bb177: ; preds = %if.end + unreachable + +sw.bb183: ; preds = %if.end + unreachable + +sw.bb189: ; preds = %if.end, %if.end, %if.end, %if.end + switch i8 undef, label %sw.default.i [ + i8 0, label %sw.bb.i + i8 1, label %sw.bb.i + i8 2, label %sw.bb.i + i8 3, label %sw.bb.i + i8 4, label %sw.bb.i + i8 5, label %sw.bb.i + i8 6, label %sw.bb.i + i8 7, label %sw.bb.i + i8 8, label %sw.bb.i + i8 9, label %sw.bb11.i + i8 10, label %Perl_sv_reftype.exit + i8 11, label %sw.bb28.i + i8 12, label %sw.bb29.i + i8 13, label %sw.bb30.i + i8 14, label %sw.bb31.i + i8 15, label %sw.bb32.i + ] + +sw.bb.i: ; preds = %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189 + unreachable + +sw.bb11.i: ; preds = %sw.bb189 + unreachable + +sw.bb28.i: ; preds = %sw.bb189 + unreachable + +sw.bb29.i: ; preds = %sw.bb189 + unreachable + +sw.bb30.i: ; preds = %sw.bb189 + unreachable + +sw.bb31.i: ; preds = %sw.bb189 + unreachable + +sw.bb32.i: ; preds = %sw.bb189 + unreachable + +sw.default.i: ; preds = %sw.bb189 + unreachable + +Perl_sv_reftype.exit: ; preds = %sw.bb189 + unreachable + +sw.bb201: ; preds = %if.end + unreachable + +sw.default: ; preds = %if.end + unreachable + +cleanup: ; preds = %entry + ret void +} Index: test/CodeGen/PowerPC/mcm-5.ll =================================================================== --- test/CodeGen/PowerPC/mcm-5.ll +++ test/CodeGen/PowerPC/mcm-5.ll @@ -17,6 +17,10 @@ i32 4, label %sw.bb1 i32 5, label %sw.bb2 i32 6, label %sw.bb3 + i32 7, label %sw.bb + i32 8, label %sw.bb1 + i32 9, label %sw.bb2 + i32 10, label %sw.bb3 ] sw.default: ; preds = %entry