Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -5128,13 +5128,16 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, const TargetTransformInfo &TTI, const DataLayout &DL, const SmallDenseMap &ResultTypes) { - if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10) + if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / (3 * 8)) return false; // TableSize overflowed, or mul below might overflow. bool AllTablesFitInRegister = true; bool HasIllegalType = false; + bool NoBiggerThanI8 = true; + unsigned BiggestTypeSize = 0; for (const auto &I : ResultTypes) { Type *Ty = I.second; + unsigned TySize = DL.getTypeAllocSize(Ty); // Saturate this flag to true. HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty); @@ -5144,9 +5147,13 @@ AllTablesFitInRegister && SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty); - // If both flags saturate, we're done. NOTE: This *only* works with - // saturating flags, and all flags have to saturate first due to the - // non-deterministic behavior of iterating over a dense map. + // Saturate this flag to false. + NoBiggerThanI8 = NoBiggerThanI8 && (TySize == 1); + + if (TySize > BiggestTypeSize) + BiggestTypeSize = TySize; + + // If these two flags saturate, we're done. if (HasIllegalType && !AllTablesFitInRegister) break; } @@ -5159,10 +5166,24 @@ if (HasIllegalType) return false; - // The table density should be at least 40%. This is the same criterion as for - // jump tables, see SelectionDAGBuilder::handleJTSwitchCase. + // If the table only contains i8s or smaller, it has a bounded size of + // 256 times the largest legal size, and will be more performant with a lookup table. + if (NoBiggerThanI8 && !SI->getFunction()->hasOptSize()) + return true; + + // If the table is smaller, always use it + if (TableSize * BiggestTypeSize + 14 < + // Table Size, including empty space, plus header size + SI->getNumCases() * 14) // size of cmp jmp mov ret on x86_64. + return true; + + // Space is more important than performance when using -Os + if (SI->getFunction()->hasOptSize()) + return false; + + // The table density should be at least 33% for 64-bit integers. // FIXME: Find the best cut-off. - return SI->getNumCases() * 10 >= TableSize * 4; + return SI->getNumCases() * 3 * 8 >= (TableSize * BiggestTypeSize); } /// Try to reuse the switch table index compare. Following pattern: @@ -5248,6 +5269,11 @@ } } +// FIXME Move this function up here when commiting. This makes the patch easier to read. +static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, + const DataLayout &DL, + const TargetTransformInfo &TTI); + /// If the switch is only used to initialize one or more phi nodes in a common /// successor block with different constant values, replace the switch with /// lookup tables. @@ -5343,6 +5369,36 @@ DefaultResults[PHI] = Result; } + // Compute the maximum table size representable by the integer type we are + // switching upon. + unsigned CaseSize = MaxCaseVal->getType()->getPrimitiveSizeInBits(); + uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; + assert(MaxTableSize >= TableSize && + "It is impossible for a switch to have more entries than the max " + "representable value of its input integer type's size."); + + // If the table is only a u8 and we do not have to check for the default case, + // extend the table so we can get rid of the branch. + if (MaxTableSize <= 256 && HasDefaultResults && !SI->getFunction()->hasOptSize()) { + TableSize = MaxTableSize; + // Un-rotate and un-xor now that we are covering the whole range, + ConstantInt *CAdd = nullptr, *CXor = nullptr; + Value *V; + if (match(SI->getCondition(), m_Add(m_Value(V), m_ConstantInt(CAdd))) || + match(SI->getCondition(), m_Xor(m_Value(V), m_ConstantInt(CXor)))) { + for (auto Case : SI->cases()) { + auto *Orig = Case.getCaseValue(); + auto Sub = CAdd ? Orig->getValue() - CAdd->getValue() : Orig->getValue(); + auto Xor = (CXor ? Sub ^ CXor->getValue() : Sub); + Case.setValue(cast(ConstantInt::get(MaxCaseVal->getContext(), Xor))); + } + SI->setCondition(V); + return true; // We will get called again + } + // Call this from in here, because we need the context necessary for this if/else + } else if (ReduceSwitchRange(SI, Builder, DL, TTI)) + return true; // We will get called again + if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes)) return false; @@ -5351,18 +5407,9 @@ BasicBlock *LookupBB = BasicBlock::Create( Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); - // Compute the table index value. Builder.SetInsertPoint(SI); Value *TableIndex = SI->getCondition(); - // Compute the maximum table size representable by the integer type we are - // switching upon. - unsigned CaseSize = MaxCaseVal->getType()->getPrimitiveSizeInBits(); - uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize; - assert(MaxTableSize >= TableSize && - "It is impossible for a switch to have more entries than the max " - "representable value of its input integer type's size."); - // If the default destination is unreachable, or if the lookup table covers // all values of the conditional variable, branch directly to the lookup table // BB. Otherwise, check that the condition is within the case range. @@ -5652,9 +5699,6 @@ if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI)) return requestResimplify(); - if (ReduceSwitchRange(SI, Builder, DL, TTI)) - return requestResimplify(); - // The conversion from switch to lookup tables results in difficult-to-analyze // code and makes pruning branches much harder. This is a problem if the // switch expression itself can still be restricted as a result of inlining or @@ -5664,6 +5708,10 @@ SwitchToLookupTable(SI, Builder, DL, TTI)) return requestResimplify(); + // This is also called within SwitchToLookupTable + if (ReduceSwitchRange(SI, Builder, DL, TTI)) + return requestResimplify(); + return false; } Index: test/Transforms/SimplifyCFG/switch-genfori8.ll =================================================================== --- /dev/null +++ test/Transforms/SimplifyCFG/switch-genfori8.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; without the -O2 opt will not generate lookup tables +; RUN: opt -S -simplifycfg -O2 < %s | FileCheck %s +; Using a Zig driver https://gist.github.com/shawnl/8137f62f7dbcfd539f6cf1925387cd38 +;after-patch, covered lookup table: 509.8MiB/sec +;lookup table, not covered, only valid digits to prime the branch predictor: 437.8MiB/sec +;lookup table, not covered, random bytes: 242.0MiB/sec +;before-patch, no lookup table: 205.4MiB/sec + +; ModuleID = 'chartodigit.c' +source_filename = "chartodigit.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +; Function Attrs: norecurse nounwind readnone uwtable +define dso_local zeroext i8 @char_to_digit(i8 zeroext) local_unnamed_addr #0 { +; CHECK-LABEL: @char_to_digit( +; CHECK-NEXT: switch.lookup: +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [256 x i8], [256 x i8]* @switch.table.char_to_digit, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i8, i8* [[SWITCH_GEP]], align 1 +; CHECK-NEXT: ret i8 [[SWITCH_LOAD]] +; + switch i8 %0, label %17 [ + i8 48, label %18 + i8 49, label %2 + i8 50, label %3 + i8 51, label %4 + i8 52, label %5 + i8 53, label %6 + i8 54, label %7 + i8 55, label %8 + i8 56, label %9 + i8 57, label %10 + i8 97, label %11 + i8 98, label %12 + i8 99, label %13 + i8 100, label %14 + i8 101, label %15 + i8 102, label %16 + ] + +;