Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -5489,6 +5489,40 @@ return true; } +/// Another cheap transform: investigate using ctlz as a key to the switch +/// TODO: this transform would benifit from proper range analysis in SwitchToLookupTable +static bool ReduceSwitchRange_ctlz(SwitchInst *SI, + const DataLayout &DL, + const TargetTransformInfo &TTI, + std::vector *Values) { + // There is a big gotcha reagarding ctlz/cttz: when input 0, they + // emit bit width. If not handled this can result in the transform + // being applied multiple times. We can either rotate bit width back to zero, + // by doing an xor or the most significant bit, followed by a rotate-left 1, + // which is expensive, or we can just avoid situation, by checking the range. + unsigned BitWidth = cast(SI->getCondition()->getType())->getIntegerBitWidth(); + if (Values->back() <= BitWidth) + return false; + + unsigned MaxPopCount = 0; + for (auto &V : *Values) { + MaxPopCount = countPopulation(V); + if (MaxPopCount > 1) + break; + } + if (MaxPopCount != 1) + return false; + // TODO a single instruction can be saved on x86 by using cttz over ctlz + // when only ctlz results in a subtraction instruction being added + for (auto &V : *Values) + V = APInt(BitWidth, V).countLeadingZeros(); + // 0 will suddenly become the largest (BitWidth), so we need to sort again. + // Sorting is OK because we either make a transform on all numbers, or do + // not make a transform at all. + std::sort(Values->begin(), Values->end()); + return true; +} + /// Try to transform a switch that has "holes" in it to a contiguous sequence /// of cases. /// @@ -5516,12 +5550,17 @@ return false; // We organize the range to start from 0, if it is not already close. - SmallVector Values; + std::vector Values; for (auto &C : SI->cases()) Values.push_back(C.getCaseValue()->getValue().getLimitedValue()); - llvm::sort(Values); + std::sort(Values.begin(), Values.end()); bool MadeChanges = false; + + bool UseCtlz = ReduceSwitchRange_ctlz(SI, DL, TTI, &Values); + if (UseCtlz) + MadeChanges = true; + // We must first look find the best start point. uint64_t BestDistance = APInt::getMaxValue(CondTy->getIntegerBitWidth()).getLimitedValue() - Values.back() + Values.front() + 1; @@ -5544,9 +5583,6 @@ // This transform can be done speculatively because it is so cheap - it results // in a single rotate operation being inserted. - // FIXME: It's possible that optimizing a switch on powers of two might also - // be beneficial - flag values are often powers of two and we could use a CLZ - // as the key function. unsigned Shift = 64; for (auto &V : Values) { @@ -5581,22 +5617,34 @@ // default case. auto *Ty = cast(SI->getCondition()->getType()); - Builder.SetInsertPoint(SI); - auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base)); - Value *Key; - if (Shift) { - auto *ShiftC = ConstantInt::get(Ty, Shift); - Function *Fshr = Intrinsic::getDeclaration(SI->getModule(), Intrinsic::fshr, Ty); - Key = Builder.CreateCall(Fshr, {Sub, Sub, ShiftC}); - } else - Key = Sub; - SI->replaceUsesOfWith(SI->getCondition(), Key); + { + Builder.SetInsertPoint(SI); + Value *Key = SI->getCondition(); + if (UseCtlz) { + auto *Zero = ConstantInt::get(IntegerType::get(Ty->getContext(), 1), 0); + Function *Ctlz = Intrinsic::getDeclaration(SI->getModule(), Intrinsic::ctlz, Ty); + Key = Builder.CreateCall(Ctlz, {Key, Zero}); + } + if (Base) + Key = Builder.CreateSub(Key, ConstantInt::get(Ty, Base)); + if (Shift) { + auto *ShiftC = ConstantInt::get(Ty, Shift); + Function *Fshr = Intrinsic::getDeclaration(SI->getModule(), Intrinsic::fshr, Ty); + Key = Builder.CreateCall(Fshr, {Key, Key, ShiftC}); + } + SI->replaceUsesOfWith(SI->getCondition(), Key); + } for (auto Case : SI->cases()) { auto *Orig = Case.getCaseValue(); - auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base); + uint64_t Zeros; + if (UseCtlz) + Zeros = Orig->getValue().countLeadingZeros(); + else + Zeros = Orig->getValue().getLimitedValue(); + auto Sub = (APInt(BitWidth, Zeros) - Base).getLimitedValue(); Case.setValue( - cast(ConstantInt::get(Ty, Sub.lshr(Shift)))); + cast(ConstantInt::get(Ty, APInt(BitWidth, Sub >> Shift)))); } return true; } Index: test/Transforms/SimplifyCFG/rangereduce.ll =================================================================== --- test/Transforms/SimplifyCFG/rangereduce.ll +++ test/Transforms/SimplifyCFG/rangereduce.ll @@ -82,8 +82,7 @@ define i32 @test3(i32 %a) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[A:%.*]], 97 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.fshr.i32(i32 [[TMP1]], i32 [[TMP1]], i32 0) -; CHECK-NEXT: switch i32 [[TMP2]], label [[DEF:%.*]] [ +; CHECK-NEXT: switch i32 [[TMP1]], label [[DEF:%.*]] [ ; CHECK-NEXT: i32 0, label [[ONE:%.*]] ; CHECK-NEXT: i32 1, label [[TWO:%.*]] ; CHECK-NEXT: i32 2, label [[THREE:%.*]] @@ -118,8 +117,7 @@ define i32 @test4(i32 %a) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[A:%.*]], 97 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.fshr.i32(i32 [[TMP1]], i32 [[TMP1]], i32 0) -; CHECK-NEXT: switch i32 [[TMP2]], label [[DEF:%.*]] [ +; CHECK-NEXT: switch i32 [[TMP1]], label [[DEF:%.*]] [ ; CHECK-NEXT: i32 0, label [[ONE:%.*]] ; CHECK-NEXT: i32 5, label [[TWO:%.*]] ; CHECK-NEXT: i32 8, label [[THREE:%.*]] @@ -156,8 +154,7 @@ define i32 @test5(i32 %a) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[A:%.*]], 97 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.fshr.i32(i32 [[TMP1]], i32 [[TMP1]], i32 0) -; CHECK-NEXT: switch i32 [[TMP2]], label [[DEF:%.*]] [ +; CHECK-NEXT: switch i32 [[TMP1]], label [[DEF:%.*]] [ ; CHECK-NEXT: i32 0, label [[ONE:%.*]] ; CHECK-NEXT: i32 5, label [[TWO:%.*]] ; CHECK-NEXT: i32 10, label [[THREE:%.*]] @@ -302,9 +299,8 @@ define i32 @test9(i32 %a) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[A:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.fshr.i32(i32 [[TMP1]], i32 [[TMP1]], i32 1) -; CHECK-NEXT: switch i32 [[TMP2]], label [[DEF:%.*]] [ +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[A]], i32 1) +; CHECK-NEXT: switch i32 [[TMP1]], label [[DEF:%.*]] [ ; CHECK-NEXT: i32 9, label [[ONE:%.*]] ; CHECK-NEXT: i32 10, label [[TWO:%.*]] ; CHECK-NEXT: i32 3, label [[THREE:%.*]] Index: test/Transforms/SimplifyCFG/switch-simplify-range.ll =================================================================== --- test/Transforms/SimplifyCFG/switch-simplify-range.ll +++ test/Transforms/SimplifyCFG/switch-simplify-range.ll @@ -44,35 +44,16 @@ define i64 @switch_clz1(i16 %a) optsize #0 { ; CHECK-LABEL: @switch_clz1( ; CHECK-NEXT: Entry: -; CHECK-NEXT: switch i16 [[A:%.*]], label [[SWITCHELSE:%.*]] [ -; CHECK-NEXT: i16 2, label [[SWITCHPRONG:%.*]] -; CHECK-NEXT: i16 4, label [[SWITCHPRONG1:%.*]] -; CHECK-NEXT: i16 8, label [[SWITCHPRONG2:%.*]] -; CHECK-NEXT: i16 1, label [[SWITCHPRONG3:%.*]] -; CHECK-NEXT: i16 64, label [[SWITCHPRONG6:%.*]] -; CHECK-NEXT: i16 128, label [[SWITCHPRONG7:%.*]] -; CHECK-NEXT: i16 16, label [[SWITCHPRONG5:%.*]] -; CHECK-NEXT: i16 32, label [[SWITCHPRONG4:%.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.ctlz.i16(i16 [[A:%.*]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = sub i16 [[TMP0]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i16 [[TMP1]], 8 +; CHECK-NEXT: br i1 [[TMP2]], label [[SWITCH_LOOKUP:%.*]], label [[SWITCHELSE:%.*]] +; CHECK: switch.lookup: +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* @switch.table.switch_clz1, i32 0, i16 [[TMP1]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i64, i64* [[SWITCH_GEP]] +; CHECK-NEXT: ret i64 [[SWITCH_LOAD]] ; CHECK: SwitchElse: -; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ 10, [[ENTRY:%.*]] ], [ 6, [[SWITCHPRONG]] ], [ 3, [[SWITCHPRONG1]] ], [ 35, [[SWITCHPRONG2]] ], [ 31, [[SWITCHPRONG3]] ], [ 53, [[SWITCHPRONG4]] ], [ 51, [[SWITCHPRONG5]] ], [ 41, [[SWITCHPRONG6]] ], [ 34, [[SWITCHPRONG7]] ] -; CHECK-NEXT: ret i64 [[MERGE]] -; CHECK: SwitchProng: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng1: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng2: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng3: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng4: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng5: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng6: -; CHECK-NEXT: br label [[SWITCHELSE]] -; CHECK: SwitchProng7: -; CHECK-NEXT: br label [[SWITCHELSE]] +; CHECK-NEXT: ret i64 10 ; Entry: switch i16 %a, label %SwitchElse [ @@ -108,26 +89,15 @@ define i64 @switch_clz2(i8 %a) optsize #0 { ; CHECK-LABEL: @switch_clz2( ; CHECK-NEXT: Entry: -; CHECK-NEXT: switch i8 [[A:%.*]], label [[SWITCHELSE:%.*]] [ -; CHECK-NEXT: i8 -128, label [[SWITCHPRONG2:%.*]] -; CHECK-NEXT: i8 64, label [[SWITCHPRONG3:%.*]] -; CHECK-NEXT: i8 32, label [[SWITCHPRONG6:%.*]] -; CHECK-NEXT: i8 1, label [[SWITCHPRONG7:%.*]] -; CHECK-NEXT: i8 0, label [[SWITCHPRONG5:%.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A:%.*]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i8 [[TMP0]], 9 +; CHECK-NEXT: br i1 [[TMP1]], label [[SWITCH_LOOKUP:%.*]], label [[SWITCHPRONG2:%.*]] +; CHECK: switch.lookup: +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [9 x i64], [9 x i64]* @switch.table.switch_clz2, i32 0, i8 [[TMP0]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i64, i64* [[SWITCH_GEP]] +; CHECK-NEXT: ret i64 [[SWITCH_LOAD]] ; CHECK: SwitchProng2: -; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ 35, [[ENTRY:%.*]] ], [ 31, [[SWITCHPRONG3]] ], [ 41, [[SWITCHPRONG6]] ], [ 40, [[SWITCHPRONG5]] ], [ 43, [[SWITCHPRONG7]] ], [ 12, [[SWITCHELSE]] ] -; CHECK-NEXT: ret i64 [[MERGE]] -; CHECK: SwitchProng3: -; CHECK-NEXT: br label [[SWITCHPRONG2]] -; CHECK: SwitchProng6: -; CHECK-NEXT: br label [[SWITCHPRONG2]] -; CHECK: SwitchProng5: -; CHECK-NEXT: br label [[SWITCHPRONG2]] -; CHECK: SwitchProng7: -; CHECK-NEXT: br label [[SWITCHPRONG2]] -; CHECK: SwitchElse: -; CHECK-NEXT: br label [[SWITCHPRONG2]] +; CHECK-NEXT: ret i64 12 ; Entry: switch i8 %a, label %SwitchElse [