Index: lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- lib/Target/PowerPC/PPCISelLowering.h +++ lib/Target/PowerPC/PPCISelLowering.h @@ -17,6 +17,7 @@ #include "PPC.h" #include "PPCInstrInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -32,6 +33,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include +#include namespace llvm { @@ -540,6 +542,32 @@ class PPCTargetLowering : public TargetLowering { const PPCSubtarget &Subtarget; + struct JTCallBack final : public CallbackVH { + JTCallBack(const Value *V, const PPCTargetLowering *TL = nullptr) + : CallbackVH(const_cast(V)), PPCTLI(TL) { } + // If we RAUW, we don't know anything about the new value, so just + // erase it from all the sets. + void allUsesReplacedWith(Value *V) override { + deleted(); + } + void deleted() override { + if (PPCTLI) { + PPCTLI->KnownNoJTSwitches.erase(*this); + PPCTLI->BlocksKnownToUseCTR.erase(*this); + PPCTLI->BlocksKnownNotToUseCTR.erase(*this); + } + } + private: + const PPCTargetLowering *PPCTLI; + }; + friend struct JTCallBack; + + mutable std::set KnownNoJTSwitches; + mutable std::set BlocksKnownToUseCTR; + mutable std::set BlocksKnownNotToUseCTR; + // The threshold for the number of cases a switch should have for jump + // tables to be profitable regardless of whether any cases have calls. + static unsigned NumCasesForJTWithCalls; public: explicit PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI); @@ -565,6 +593,31 @@ bool useSoftFloat() const override; + // A jump table is emitted as lwax -> mtctr -> bctr. All three of those + // are high latency instructions, so we don't want to emit jump tables for + // switches with fewer than 5 cases as the jump table overhead will outweigh + // the benefit. Please note that a jump table will only be emitted for a + // switch with so few cases if there are no calls or loop headers in any + // of the case target blocks. + // The lwax -> mtctr sequence takes 11 cycles whereas each + // cmpli takes 2 cycles and can dispatch 4-wide. Assuming the + // worst case - last branch is taken and linear comparisons, we can + // (theoretically) complete 6 cmpli's in 3 cycles and then the 6 branches + // in 8 cycles with a single cycle overlap for a total of 10 cycles. + // However, in practice (as confirmed by benchmarking on a Power9 machine) + // a jump table can be the better choice for switches as small as 5 cases + // as long as there are no calls in the target blocks. + unsigned getMinimumJumpTableEntries() const override { + return 5; + } + /// Does the basic block use the count register (CTR)? This will happen if + /// the block has a call or is terminated by a switch that might be lowered + /// to a jump table. + bool mightUseCTR(const BasicBlock *BB) const; + + bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, + uint64_t Range) const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -113,12 +113,22 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); +STATISTIC(NumCallsToIsSuitableForJT, + "Number of times a switch is checked for JT eligibility"); +STATISTIC(NumSkippedAlreadyExaminedSwitches, + "Number of switches previously proven not eligible for JT"); +STATISTIC(NumSkippedAlreadyVisitedCases, + "Number of cases not examined for CTR use due to previous visits"); +STATISTIC(NumSwitchesNotSuitableForJT, + "Number of switch instructions not suitable for jump tables"); static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); // FIXME: Remove this once the bug has been fixed! extern cl::opt ANDIGlueBug; +unsigned PPCTargetLowering::NumCasesForJTWithCalls = 64; + PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1130,6 +1140,89 @@ return Subtarget.useSoftFloat(); } +bool PPCTargetLowering::mightUseCTR(const BasicBlock *BB) const { + // Skip blocks known to use the CTR. + if (BlocksKnownToUseCTR.count(BB)) { + NumSkippedAlreadyVisitedCases++; + return true; + } + if (BlocksKnownNotToUseCTR.count(BB)) { + NumSkippedAlreadyVisitedCases++; + return false; + } + for (auto &Inst : *BB) { + // Benchmarking shows that there really isn't any difference between + // direct and indirect calls when it comes to jump table profitability. + if (isa(&Inst)) { + BlocksKnownToUseCTR.emplace(BB, this); + return true; + } else if (const SwitchInst *SI = dyn_cast(&Inst)) { + // Does the block have a switch instruction large enough to actually + // be lowered to a jump table? If so, conservatively assume that one + // will be lowered to a jump table. + if ((SI->getNumCases() >= getMinimumJumpTableEntries())) { + BlocksKnownToUseCTR.emplace(BB, this); + return true; + } + } + } + BlocksKnownNotToUseCTR.emplace(BB, this); + return false; +} + +// This function is called a huge number of times so we try to limit the number +// of traversals of all basic blocks. Just as an example, a bootstrap build +// calls this function 11,822,805 times on 7554 unique SwitchInst instances. We +// conservatively re-check any SwitchInst's that were previously suitable for a +// jump table to ensure that some transformation hasn't made it unsuitable. +bool PPCTargetLowering::isSuitableForJumpTable(const SwitchInst *SI, + uint64_t NumCases, + uint64_t Range) const { + NumCallsToIsSuitableForJT++; + if (!TargetLoweringBase::isSuitableForJumpTable(SI, NumCases, Range)) + return false; + + // Jump tables are fine with very large switch statements. + bool LargeSwitch = SI->getNumCases() >= NumCasesForJTWithCalls; + if (LargeSwitch) { + BlocksKnownToUseCTR.emplace(SI->getParent(), this); + BlocksKnownNotToUseCTR.erase(SI->getParent()); + return true; + } + + // If we already know that we don't want to lower this SwitchInst to + // a jump table, no need to check further. + if (KnownNoJTSwitches.count(SI)) { + NumSkippedAlreadyExaminedSwitches++; + return false; + } + + // At this point we know we're not looking at a huge switch, so a CTR + // use in any of the cases makes this SI not suitable for a jump table. + // FIXME: a switch shouldn't use a jump table only if a case that is + // actually visited has a use of the CTR. Perhaps branch probability + // should be considered here in the future (i.e. if the branch probability + // is way less than 1 / NumCases). + for (auto It : SI->cases()) { + if (mightUseCTR(It.getCaseSuccessor())) { + NumSwitchesNotSuitableForJT++; + KnownNoJTSwitches.emplace(SI, this); + DEBUG(dbgs() << "In function: " << + SI->getParent()->getParent()->getName()); + DEBUG(dbgs() << "\nSwitch instruction:\n"); + DEBUG(SI->dump()); + DEBUG(dbgs() << "Not suitable for a jump table due to possible CTR " + "use in successor:\n"); + DEBUG(It.getCaseSuccessor()->dump()); + return false; + } + } + // The containing block now uses the CTR. + BlocksKnownToUseCTR.emplace(SI->getParent(), this); + BlocksKnownNotToUseCTR.erase(SI->getParent()); + return true; +} + const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; Index: test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll =================================================================== --- test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll +++ test/CodeGen/PowerPC/NoJTsDueToCTRUse.ll @@ -0,0 +1,396 @@ +; RUN: llc -verify-machineinstrs -debug-only=ppc-lowering < %s > /dev/null 2>%t && FileCheck <%t %s +; REQUIRES: asserts +define signext i32 @case1(i32 signext %a, i32 signext %b, i32 signext %c) { +entry: +; CHECK: In function: case1 +; CHECK: The switch instruction in following block isn't suitable for a Jump Table: +; CHECK: entry: +; CHECK: call void @llvm.ppc.mtctr.i64(i64 2045) + call void @llvm.ppc.mtctr.i64(i64 2045) + switch i32 %b, label %sw.epilog [ + i32 1, label %sw.bb + i32 2, label %sw.bb1 + i32 3, label %sw.bb2 + i32 4, label %sw.bb4 + i32 5, label %sw.bb7 + i32 6, label %sw.bb9 + i32 21, label %sw.bb11 + i32 22, label %sw.bb14 + i32 23, label %sw.bb17 + i32 24, label %sw.bb20 + ] + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb1: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb4: ; preds = %entry + %add6 = add nsw i32 %a, 15 + br label %sw.epilog + +sw.bb7: ; preds = %entry + %0 = mul i32 %a, -89 + %sub8 = add i32 %0, 5 + br label %sw.epilog + +sw.bb9: ; preds = %entry + %div = sdiv i32 89, %a + %sub10 = sub nsw i32 6, %div + br label %sw.epilog + +sw.bb11: ; preds = %entry + %div12 = sdiv i32 89, %a + %add13 = add nsw i32 %div12, 21 + br label %sw.epilog + +sw.bb14: ; preds = %entry + %mul15 = mul nsw i32 %a, 89 + %add16 = add nsw i32 %mul15, 22 + br label %sw.epilog + +sw.bb17: ; preds = %entry + %sub18 = sub i32 89, %a + %add19 = add nsw i32 %sub18, 23 + br label %sw.epilog + +sw.bb20: ; preds = %entry + %1 = mul i32 %a, -11 + %add23 = add i32 %1, 113 + br label %sw.epilog + +sw.epilog: ; preds = %entry, %sw.bb20, %sw.bb17, %sw.bb14, %sw.bb11, %sw.bb9, %sw.bb7, %sw.bb4, %sw.bb2, %sw.bb1, %sw.bb + %b.addr.0 = phi i32 [ %b, %entry ], [ %add23, %sw.bb20 ], [ %add19, %sw.bb17 ], [ %add16, %sw.bb14 ], [ %add13, %sw.bb11 ], [ %sub10, %sw.bb9 ], [ %sub8, %sw.bb7 ], [ %add6, %sw.bb4 ], [ 703, %sw.bb2 ], [ -85, %sw.bb1 ], [ 8, %sw.bb ] + %add24 = add nsw i32 %b.addr.0, %a + ret i32 %add24 +} + +declare void @llvm.ppc.mtctr.i64(i64) + +define fastcc void @case2() { +; CHECK: In function: case2 +; CHECK: Switch instruction: +; CHECK: switch i32 %0, label %sw.default [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHEC: sw.bb20: +entry: + switch i16 undef, label %sw.default [ + i16 54, label %sw.bb + i16 55, label %sw.bb3 + i16 57, label %sw.bb14 + i16 68, label %sw.bb20 + i16 67, label %sw.bb27 + i16 58, label %sw.bb36 + i16 134, label %sw.bb45 + i16 61, label %sw.bb54 + i16 63, label %sw.bb82 + i16 62, label %while.body.i486 + i16 69, label %while.body.i499 + i16 59, label %while.body.i512 + i16 66, label %sw.bb102 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb3: ; preds = %entry + unreachable + +sw.bb14: ; preds = %entry + unreachable + +sw.bb20: ; preds = %entry + br label %land.rhs.i258 + +land.rhs.i258: ; preds = %while.body.i262, %sw.bb20 + %cur.addr.019.i255.idx = phi i64 [ 1, %sw.bb20 ], [ %cur.addr.019.i255.add, %while.body.i262 ] + br label %while.body.i262 + +while.body.i262: ; preds = %land.rhs.i258 + %cur.addr.019.i255.add = add nuw nsw i64 %cur.addr.019.i255.idx, 1 + %cmp2.i261 = icmp ult i64 %cur.addr.019.i255.idx, 2045 + br i1 %cmp2.i261, label %land.rhs.i258, label %safe_concat.exit264 + +safe_concat.exit264: ; preds = %while.body.i262 + unreachable + +sw.bb27: ; preds = %entry + unreachable + +sw.bb36: ; preds = %entry + unreachable + +sw.bb45: ; preds = %entry + unreachable + +sw.bb54: ; preds = %entry + unreachable + +sw.bb82: ; preds = %entry + unreachable + +while.body.i486: ; preds = %entry + unreachable + +while.body.i499: ; preds = %entry + unreachable + +while.body.i512: ; preds = %entry + ret void + +sw.bb102: ; preds = %entry + unreachable + +sw.default: ; preds = %entry + unreachable +} + +define signext i32 @case3(i32 signext %a, i32 (i32)* nocapture %Fptr) { +; CHECK: In function: case3 +; CHECK: The switch instruction in following block isn't suitable for a Jump Table: +; CHECK: entry: +; CHECK: %call = tail call signext i32 %Fptr(i32 signext %a) +entry: + %call = tail call signext i32 %Fptr(i32 signext %a) #1 + switch i32 %call, label %sw.epilog [ + i32 1, label %sw.bb + i32 2, label %sw.bb1 + i32 3, label %sw.bb2 + i32 4, label %sw.bb4 + i32 5, label %sw.bb7 + i32 6, label %sw.bb9 + i32 21, label %sw.bb11 + i32 22, label %sw.bb14 + i32 23, label %sw.bb17 + i32 24, label %sw.bb20 + ] + +sw.bb: ; preds = %entry + br label %sw.epilog + +sw.bb1: ; preds = %entry + br label %sw.epilog + +sw.bb2: ; preds = %entry + br label %sw.epilog + +sw.bb4: ; preds = %entry + %add6 = add nsw i32 %a, 15 + br label %sw.epilog + +sw.bb7: ; preds = %entry + %0 = mul i32 %a, -89 + %sub8 = add i32 %0, 5 + br label %sw.epilog + +sw.bb9: ; preds = %entry + %div = sdiv i32 89, %a + %sub10 = sub nsw i32 6, %div + br label %sw.epilog + +sw.bb11: ; preds = %entry + %div12 = sdiv i32 89, %a + %add13 = add nsw i32 %div12, 21 + br label %sw.epilog + +sw.bb14: ; preds = %entry + %mul15 = mul nsw i32 %a, 89 + %add16 = add nsw i32 %mul15, 22 + br label %sw.epilog + +sw.bb17: ; preds = %entry + %sub18 = sub i32 89, %a + %add19 = add nsw i32 %sub18, 23 + br label %sw.epilog + +sw.bb20: ; preds = %entry + %1 = mul i32 %a, -11 + %add23 = add i32 %1, 113 + br label %sw.epilog + +sw.epilog: ; preds = %entry, %sw.bb20, %sw.bb17, %sw.bb14, %sw.bb11, %sw.bb9, %sw.bb7, %sw.bb4, %sw.bb2, %sw.bb1, %sw.bb + %b.0 = phi i32 [ %call, %entry ], [ %add23, %sw.bb20 ], [ %add19, %sw.bb17 ], [ %add16, %sw.bb14 ], [ %add13, %sw.bb11 ], [ %sub10, %sw.bb9 ], [ %sub8, %sw.bb7 ], [ %add6, %sw.bb4 ], [ 703, %sw.bb2 ], [ -85, %sw.bb1 ], [ 8, %sw.bb ] + %add24 = add nsw i32 %b.0, %a + ret i32 %add24 +} + +define void @case4() { +; CHECK: In function: case4 +; CHECK: Switch instruction: +; CHECK: switch i32 undef, label %sw.default660 [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHECK: sw.bb466: +; CHECK: tail call void undef(i8* undef) +entry: + switch i32 undef, label %sw.default660 [ + i32 0, label %sw.bb + i32 20, label %sw.bb458 + i32 19, label %sw.bb444 + i32 15, label %sw.bb361 + i32 29, label %sw.bb44 + i32 13, label %sw.bb53 + i32 17, label %sw.bb377 + i32 16, label %sw.bb372 + i32 25, label %sw.bb535 + i32 24, label %sw.bb484 + i32 36, label %sw.bb366 + i32 14, label %sw.bb315 + i32 12, label %sw.bb310 + i32 30, label %sw.bb466 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb44: ; preds = %entry + unreachable + +sw.bb53: ; preds = %entry + unreachable + +sw.bb310: ; preds = %entry + unreachable + +sw.bb315: ; preds = %entry + unreachable + +sw.bb361: ; preds = %entry + unreachable + +sw.bb366: ; preds = %entry + unreachable + +sw.bb372: ; preds = %entry + unreachable + +sw.bb377: ; preds = %entry + unreachable + +sw.bb444: ; preds = %entry + unreachable + +sw.bb458: ; preds = %entry + unreachable + +sw.bb466: ; preds = %entry + tail call void undef(i8* undef) #1 + unreachable + +sw.bb484: ; preds = %entry + unreachable + +sw.bb535: ; preds = %entry + unreachable + +sw.default660: ; preds = %entry + unreachable +} + +define void @case5(i1 %in) { +; CHECK: In function: case5 +; CHECK: Switch instruction: +; CHECK: switch i32 %0, label %sw.default [ +; CHECK: Not suitable for a jump table due to possible CTR use in successor: +; CHECK: sw.bb189: ; preds = %if.end, %if.end, %if.end, %if.end +; CHECK: switch i32 %1, label %sw.default.i [ +entry: + br i1 %in, label %cleanup, label %if.end + +if.end: ; preds = %entry + switch i8 undef, label %sw.default [ + i8 0, label %undef_sstr + i8 1, label %sw.bb31 + i8 2, label %sw.bb80 + i8 3, label %sw.bb124 + i8 4, label %sw.bb171 + i8 14, label %sw.bb171 + i8 5, label %sw.bb177 + i8 6, label %sw.bb183 + i8 10, label %sw.bb189 + i8 11, label %sw.bb189 + i8 12, label %sw.bb189 + i8 15, label %sw.bb189 + i8 13, label %sw.bb201 + ] + +undef_sstr: ; preds = %sw.bb31, %if.end + unreachable + +sw.bb31: ; preds = %if.end + br label %undef_sstr + +sw.bb80: ; preds = %if.end + unreachable + +sw.bb124: ; preds = %if.end + unreachable + +sw.bb171: ; preds = %if.end, %if.end + unreachable + +sw.bb177: ; preds = %if.end + unreachable + +sw.bb183: ; preds = %if.end + unreachable + +sw.bb189: ; preds = %if.end, %if.end, %if.end, %if.end + switch i8 undef, label %sw.default.i [ + i8 0, label %sw.bb.i + i8 1, label %sw.bb.i + i8 2, label %sw.bb.i + i8 3, label %sw.bb.i + i8 4, label %sw.bb.i + i8 5, label %sw.bb.i + i8 6, label %sw.bb.i + i8 7, label %sw.bb.i + i8 8, label %sw.bb.i + i8 9, label %sw.bb11.i + i8 10, label %Perl_sv_reftype.exit + i8 11, label %sw.bb28.i + i8 12, label %sw.bb29.i + i8 13, label %sw.bb30.i + i8 14, label %sw.bb31.i + i8 15, label %sw.bb32.i + ] + +sw.bb.i: ; preds = %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189, %sw.bb189 + unreachable + +sw.bb11.i: ; preds = %sw.bb189 + unreachable + +sw.bb28.i: ; preds = %sw.bb189 + unreachable + +sw.bb29.i: ; preds = %sw.bb189 + unreachable + +sw.bb30.i: ; preds = %sw.bb189 + unreachable + +sw.bb31.i: ; preds = %sw.bb189 + unreachable + +sw.bb32.i: ; preds = %sw.bb189 + unreachable + +sw.default.i: ; preds = %sw.bb189 + unreachable + +Perl_sv_reftype.exit: ; preds = %sw.bb189 + unreachable + +sw.bb201: ; preds = %if.end + unreachable + +sw.default: ; preds = %if.end + unreachable + +cleanup: ; preds = %entry + ret void +} Index: test/CodeGen/PowerPC/mcm-5.ll =================================================================== --- test/CodeGen/PowerPC/mcm-5.ll +++ test/CodeGen/PowerPC/mcm-5.ll @@ -17,6 +17,10 @@ i32 4, label %sw.bb1 i32 5, label %sw.bb2 i32 6, label %sw.bb3 + i32 7, label %sw.bb + i32 8, label %sw.bb1 + i32 9, label %sw.bb2 + i32 10, label %sw.bb3 ] sw.default: ; preds = %entry