Index: clang/test/Frontend/optimization-remark-analysis.c =================================================================== --- clang/test/Frontend/optimization-remark-analysis.c +++ clang/test/Frontend/optimization-remark-analysis.c @@ -1,5 +1,5 @@ -// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS -// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s +// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS +// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -S %s -o - 2>&1 | FileCheck %s // RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement // CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement Index: llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h =================================================================== --- llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -29,6 +29,7 @@ bool SinkCommonInsts = false; bool SimplifyCondBranch = true; bool FoldTwoEntryPHINode = true; + unsigned SwitchRemovalThreshold = 0; AssumptionCache *AC = nullptr; @@ -70,6 +71,11 @@ FoldTwoEntryPHINode = B; return *this; } + + SimplifyCFGOptions &switchRemovalThreshold(unsigned I) { + SwitchRemovalThreshold = I; + return *this; + } }; } // namespace llvm Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -256,6 +256,17 @@ cl::desc("Run synthetic function entry count generation " "pass")); +static cl::opt + RemoveSwitchBlocks("remove-switch-blocks", cl::init(true), cl::Hidden, + cl::desc("Convert switch 
blocks into a branch sequence " + "prior to vectorization.")); + +// This value determines the point at which we stop removing switch statements +// before the vectorizer pass. Removing switch blocks and replacing them with +// compares and branches allows architectures that support predication to +// vectorize. +static const int RemoveSwitchCaseThreshold = 4; + static const Regex DefaultAliasRegex( "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$"); @@ -1201,6 +1212,10 @@ /// TODO: Should LTO cause any differences to this set of passes? void PassBuilder::addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, bool IsFullLTO) { + if (RemoveSwitchBlocks) + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().switchRemovalThreshold( + RemoveSwitchCaseThreshold))); + FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); Index: llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -55,6 +55,11 @@ "bonus-inst-threshold", cl::Hidden, cl::init(1), cl::desc("Control the number of bonus instructions (default = 1)")); +static cl::opt UserSwitchRemovalThreshold( + "switch-removal-threshold", cl::Hidden, cl::init(0), + cl::desc("Set the threshold for the number of switch cases where we " + "convert switch blocks to branches and compares")); + static cl::opt UserKeepLoops( "keep-loops", cl::Hidden, cl::init(true), cl::desc("Preserve canonical loop structure (default = true)")); @@ -308,6 +313,8 @@ Options.HoistCommonInsts = UserHoistCommonInsts; if (UserSinkCommonInsts.getNumOccurrences()) Options.SinkCommonInsts = UserSinkCommonInsts; + if (UserSwitchRemovalThreshold.getNumOccurrences()) + Options.SwitchRemovalThreshold = UserSwitchRemovalThreshold; } SimplifyCFGPass::SimplifyCFGPass() : Options() { Index: 
llvm/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6141,6 +6141,102 @@ return true; } +// Attempt to turn a switch statement into a series of conditional branches +// which we may later be able to vectorize. +static bool TurnSmallSwitchIntoICmps(SwitchInst *SI, IRBuilder<> &Builder) { + assert(SI->getNumCases() > 1 && "Degenerate switch?"); + + // Check to see if we have a genuine default, reachable block with executable + // instructions in them. + bool HasDefault = + !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + + BasicBlock *DefaultBlock = HasDefault ? SI->getDefaultDest() : nullptr; + BasicBlock *BB = SI->getParent(); + + // Make sure each of the cases has a unique destination + for (auto Case : SI->cases()) + if (!SI->findCaseDest(Case.getCaseSuccessor())) + return false; + + // Record the total weighting for this switch block. + uint64_t TotalWeight = 0; + SmallVector Weights; + if (HasBranchWeights(SI)) { + GetBranchWeights(SI, Weights); + if (Weights.size() == (SI->getNumCases() + 1)) + for (auto W : Weights) + TotalWeight += W; + } + + BasicBlock *FalseDest = nullptr; + uint64_t FalseWeight = TotalWeight; + for (auto CI : SI->cases()) { + BasicBlock *TrueDest = CI.getCaseSuccessor(); + Value *Cmp = + Builder.CreateICmpEQ(SI->getCondition(), CI.getCaseValue(), "switch"); + + // Walk through PHIs in TrueDest and see which ones came + // from the switch block, then remap them. + if (FalseDest) { + for (PHINode &PN : TrueDest->phis()) { + for (auto PB : PN.blocks()) { + if (PB == BB) { + Value *V = PN.getIncomingValueForBlock(BB); + PN.removeIncomingValue(BB, false); + PN.addIncoming(V, FalseDest); + } + } + } + } + + BasicBlock *MoveAfter = FalseDest ? 
FalseDest : BB; + FalseDest = BasicBlock::Create(BB->getContext(), BB->getName() + ".switch", + BB->getParent(), BB); + FalseDest->moveAfter(MoveAfter); + + Instruction *I = Builder.CreateCondBr(Cmp, TrueDest, FalseDest); + // Update weight for the newly-created conditional branch. + // We set the weight of the TrueDest to the weight for the successor + // of the current case. The FalseDest is assigned the remaining total + // weight, minus the weight assigned to TrueDest. + if (TotalWeight) { + int Index = CI.getSuccessorIndex(); + FalseWeight -= Weights[Index]; + setBranchWeights(I, Weights[Index], FalseWeight); + } + Builder.SetInsertPoint(FalseDest); + } + + if (DefaultBlock) { + Builder.CreateBr(DefaultBlock); + + // The block that we jump to may have had some PHIs that came + // from the block containing the switch statement. Now that we + // are removing the switch statement we need to fix up the PHIs. + + // Walk through PHIs in DefaultBlock and see which ones came + // from the switch block, then remap them. + for (PHINode &PN : DefaultBlock->phis()) { + for (auto PB : PN.blocks()) { + if (PB == BB) { + Value *V = PN.getIncomingValueForBlock(BB); + PN.removeIncomingValue(BB, false); + PN.addIncoming(V, FalseDest); + } + } + } + } else + Builder.CreateUnreachable(); + + // Drop the switch. + SI->eraseFromParent(); + + Builder.SetInsertPoint(BB); + + return true; +} + bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { BasicBlock *BB = SI->getParent(); @@ -6163,8 +6259,14 @@ return requestResimplify(); } + unsigned NumCases = SI->getNumCases(); + bool RemoveSwitches = Options.SwitchRemovalThreshold >= NumCases; + + if (RemoveSwitches && TurnSmallSwitchIntoICmps(SI, Builder)) + return simplifyCFG(BB, TTI, DTU, Options) | true; + // Try to transform the switch into an icmp and a branch. 
- if (TurnSwitchRangeIntoICmp(SI, Builder)) + if (!RemoveSwitches && TurnSwitchRangeIntoICmp(SI, Builder)) return requestResimplify(); // Remove unreachable cases. @@ -6412,16 +6514,18 @@ if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) return requestResimplify(); - // This block must be empty, except for the setcond inst, if it exists. - // Ignore dbg and pseudo intrinsics. - auto I = BB->instructionsWithoutDebug(true).begin(); - if (&*I == BI) { - if (FoldValueComparisonIntoPredecessors(BI, Builder)) - return requestResimplify(); - } else if (&*I == cast(BI->getCondition())) { - ++I; - if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) - return requestResimplify(); + if (Options.SwitchRemovalThreshold == 0) { + // This block must be empty, except for the setcond inst, if it exists. + // Ignore dbg and pseudo intrinsics. + auto I = BB->instructionsWithoutDebug(true).begin(); + if (&*I == BI) { + if (FoldValueComparisonIntoPredecessors(BI, Builder)) + return requestResimplify(); + } else if (&*I == cast(BI->getCondition())) { + ++I; + if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder)) + return requestResimplify(); + } } } Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -216,6 +216,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings +; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis Index: llvm/test/Other/new-pm-lto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-lto-defaults.ll +++ llvm/test/Other/new-pm-lto-defaults.ll @@ -105,6 +105,7 @@ ; CHECK-O23SZ-NEXT: 
Running pass: LoopDeletionPass on Loop ; CHECK-O23SZ-NEXT: Running pass: LoopFullUnrollPass on Loop ; CHECK-O23SZ-NEXT: Running pass: LoopDistributePass on foo +; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo ; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo ; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo Index: llvm/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-defaults.ll +++ llvm/test/Other/new-pm-thinlto-defaults.ll @@ -197,6 +197,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopRotatePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings +; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -168,6 +168,7 @@ ; CHECK-O-NEXT: Running pass: LoopRotatePass ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings +; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -180,6 +180,7 @@ ; CHECK-O-NEXT: Running pass: 
LoopRotatePass ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings +; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O3 -loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve -scalable-vectorization=on -S | FileCheck %s + +define void @switch(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 { +; CHECK-LABEL: @switch( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = select [[TMP8]], shufflevector ( insertelement ( poison, i1 false, i32 0), poison, zeroinitializer), [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP10]], 
shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], shufflevector ( insertelement ( poison, i1 false, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP11]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP16]], i32 4, [[TMP15]], poison) +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP17]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP19:%.*]] = select [[TMP8]], shufflevector ( insertelement ( poison, i1 false, i32 0), poison, zeroinitializer), [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP11]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP20]], i32 4, [[TMP19]], poison) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD6]] +; CHECK-NEXT: [[PREDPHI7:%.*]] = select [[TMP15]], [[TMP18]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw [[PREDPHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP22:%.*]] = add nsw [[TMP21]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP23:%.*]] = or [[TMP19]], [[TMP15]] +; CHECK-NEXT: [[TMP24:%.*]] = select [[TMP13]], [[TMP10]], shufflevector ( insertelement ( poison, i1 false, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI8:%.*]] = select [[TMP24]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP26:%.*]] = or [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP25]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP27]], i32 4, [[TMP26]], poison) +; CHECK-NEXT: [[TMP28:%.*]] = mul nsw [[WIDE_MASKED_LOAD9]], [[PREDPHI8]] +; CHECK-NEXT: 
[[TMP29:%.*]] = add nsw [[TMP28]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP25]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP30]], i32 4, [[TMP8]], poison) +; CHECK-NEXT: [[PREDPHI11:%.*]] = select [[TMP26]], [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD10]] +; CHECK-NEXT: [[PREDPHI12:%.*]] = select [[TMP26]], [[TMP29]], shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP31:%.*]] = mul nsw [[PREDPHI11]], [[PREDPHI11]] +; CHECK-NEXT: [[TMP32:%.*]] = add nsw [[TMP31]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP6]] to * +; CHECK-NEXT: store [[TMP32]], * [[TMP33]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], {{.*}} +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ {{.*}}, %for.body.preheader ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: switch i32 [[TMP35]], label [[FOR_BODY_SWITCH5:%.*]] [ +; CHECK-NEXT: i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]] +; CHECK-NEXT: i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]] +; CHECK-NEXT: i32 3, label [[L3:%.*]] +; CHECK-NEXT: ] + +entry: + br label %for.body + +for.body: + %i = phi i64 [ %inc, %L4 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i + %0 = load i32, i32* %arrayidx + switch i32 %0, label %L1 [ + i32 4, label %L4 + i32 2, label %L2 + i32 3, label %L3 + ] + +L1: + %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i + %1 = load i32, i32* %arrayidx5 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %0 + store i32 %add, i32* %arrayidx + br label %L2 + +L2: + %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ] + 
%arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i + %3 = load i32, i32* %arrayidx7 + %mul9 = mul nsw i32 %3, %3 + %add11 = add nsw i32 %2, %mul9 + store i32 %add11, i32* %arrayidx + br label %L3 + +L3: + %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ] + %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i + %5 = load i32, i32* %arrayidx13 + %mul14 = mul nsw i32 %5, %4 + %add16 = add nsw i32 %mul14, %4 + store i32 %add16, i32* %arrayidx + br label %L4 + +L4: + %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ] + %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i + %7 = load i32, i32* %arrayidx17 + %mul19 = mul nsw i32 %7, %7 + %add21 = add nsw i32 %6, %mul19 + store i32 %add21, i32* %arrayidx + %inc = add nuw nsw i64 %i, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @switch_VF1_UF2(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 { +; CHECK-LABEL: @switch_VF1_UF2( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> , <2 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* 
[[TMP8]], align 4 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP9]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP16]], +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <2 x i32> [[TMP17]], [[TMP6]] +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> , <2 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* +; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = shl nsw <2 x i32> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i32> [[TMP23]], [[TMP19]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP24]], <2 x i32>* [[TMP25]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, %for.body.preheader ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SWITCH:%.*]] = icmp eq i32 [[TMP27]], 3 +; CHECK-NEXT: br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]] + +entry: + br label %for.body + +for.body: + %i = phi i64 [ %inc, %L3 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i + %0 = load i32, i32* %arrayidx + %switch = icmp eq i32 %0, 3 + br i1 %switch, label %L3, label %for.body.switch + +for.body.switch: + %switch1 = icmp eq i32 %0, 2 + br i1 %switch1, label %L2, label %for.body.switch2 + +for.body.switch2: + %add = mul nsw i32 %0, 3 + store i32 %add, i32* %arrayidx + br label %L2 + +L2: + %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ] + %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i + %2 = load i32, i32* %arrayidx5 + %mul6 = mul nsw i32 %2, 3 + %add8 = add nsw i32 %1, %mul6 + store i32 %add8, i32* %arrayidx + br label %L3 + +L3: + %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ] + %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i + %4 = load i32, i32* %arrayidx9 + %mul10 = shl nsw i32 %4, 2 + %add12 = add nsw i32 %3, %mul10 + store i32 %add12, i32* %arrayidx + %inc = add nuw nsw i64 %i, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret void +} + +; This loop will not vectorize due to unsafe FP ops, ensure the switch statement is created again in for.body +define float @switch_no_vectorize(i32* noalias %a, i32* noalias %b, i32* noalias %c, float %val, i64 %N) { +; CHECK-LABEL: @switch_no_vectorize( +; CHECK-NOT: vector.body +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]] +; CHECK-NEXT: 
[[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [ +; CHECK-NEXT: i32 3, label [[L3]] +; CHECK-NEXT: i32 2, label [[L2:%.*]] +; CHECK-NEXT: ] + +entry: + br label %for.body + +for.body: + %i = phi i64 [ %inc, %L3 ], [ 0, %entry ] + %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i + %0 = load i32, i32* %arrayidx + switch i32 %0, label %L1 [ + i32 3, label %L3 + i32 2, label %L2 + ] + +L1: + %conv = sitofp i32 %0 to float + %conv4 = fpext float %conv to double + %add = fadd double %conv4, 1.000000e+00 + %conv5 = fpext float %sum.033 to double + %mul = fmul double %add, %conv5 + %conv6 = fptrunc double %mul to float + br label %L2 + +L2: + %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ] + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i + %1 = load i32, i32* %arrayidx7 + %conv8 = sitofp i32 %1 to float + %conv9 = fpext float %conv8 to double + %add10 = fadd double %conv9, 2.000000e+00 + %conv11 = fpext float %sum.1 to double + %mul12 = fmul double %add10, %conv11 + %conv13 = fptrunc double %mul12 to float + br label %L3 + +L3: + %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ] + %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i + %2 = load i32, i32* %arrayidx14 + %conv15 = sitofp i32 %2 to float + %conv16 = fpext float %conv15 to double + %add17 = fadd double %conv16, 3.000000e+00 + %conv18 = fpext float %sum.2 to double + %mul19 = fmul double %add17, %conv18 + %conv20 = fptrunc double %mul19 to float + %inc = add nuw nsw i64 %i, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret float %conv20 +} + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.vectorize.width", i32 1} +!2 = !{!"llvm.loop.interleave.count", i32 2} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} 
Index: llvm/test/Transforms/LoopVectorize/remove-switches.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/remove-switches.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O3 -loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS + +; We should not vectorize this loop since we do not have masked loads and stores +; CHECK-REMARKS: remark: :0:0: the cost-model indicates that vectorization is not beneficial +define void @switch_cost(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) #0 { +; CHECK-LABEL: @switch_cost( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOT: vector.body +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: switch i32 [[TMP0]], label [[FOR_BODY_SWITCH5:%.*]] [ +; CHECK-NEXT: i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]] +; CHECK-NEXT: i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]] +; CHECK-NEXT: i32 3, label [[L3:%.*]] +; CHECK-NEXT: ] + +entry: + br label %for.body + +for.body: + %i = phi i64 [ %inc, %L4 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i + %0 = load i32, i32* %arrayidx + switch i32 %0, label %L1 [ + i32 4, label %L4 + i32 2, label %L2 + i32 3, label %L3 + ] + +L1: + %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i + %1 = load i32, i32* %arrayidx5 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %0 + store i32 %add, i32* %arrayidx + br label %L2 + +L2: + %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ] + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i + %3 = load i32, i32* %arrayidx7 + %mul9 = mul nsw i32 
%3, %3 + %add11 = add nsw i32 %2, %mul9 + store i32 %add11, i32* %arrayidx + br label %L3 + +L3: + %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ] + %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i + %5 = load i32, i32* %arrayidx13 + %mul14 = mul nsw i32 %5, %4 + %add16 = add nsw i32 %mul14, %4 + store i32 %add16, i32* %arrayidx + br label %L4 + +L4: + %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ] + %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i + %7 = load i32, i32* %arrayidx17 + %mul19 = mul nsw i32 %7, %7 + %add21 = add nsw i32 %6, %mul19 + store i32 %add21, i32* %arrayidx + %inc = add nuw nsw i64 %i, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @switch(i32* noalias %a, i32* noalias %b, i64 %N) { +; CHECK-LABEL: @switch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP14:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[DOTOP:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> , <4 x i32> [[DOTOP]] +; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER4]] +; CHECK: for.body.preheader4: +; CHECK-NEXT: [[I_015_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I_015_PH]], [[FOR_BODY_PREHEADER4]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_015]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[SWITCH:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[SWITCH1:%.*]] = icmp eq i32 [[TMP10]], 2 +; CHECK-NEXT: [[R_0_OP:%.*]] = select i1 [[SWITCH1]], i32 9, i32 16 +; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[SWITCH]], i32 7, i32 [[R_0_OP]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_015]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[ADD4]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INC]] 
= add nuw nsw i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+
+entry:
+  %cmp14 = icmp sgt i64 %N, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %L3
+  br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body: ; preds = %for.body.preheader, %L3
+  %i.015 = phi i64 [ %inc, %L3 ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.015
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+    i32 3, label %L3
+    i32 2, label %L2
+  ]
+
+L1: ; preds = %for.body
+  br label %L2
+
+L2: ; preds = %for.body, %L1
+  %r.0 = phi i32 [ 12, %L1 ], [ 5, %for.body ]
+  br label %L3
+
+L3: ; preds = %for.body, %L2
+  %r.1 = phi i32 [ %r.0, %L2 ], [ 3, %for.body ]
+  %add4 = add nuw nsw i32 %r.1, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i.015
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %add4
+  store i32 %mul, i32* %arrayidx5
+  %inc = add nuw nsw i64 %i.015, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+; @switch_VF1_UF2: the loop back edge carries !llvm.loop !1 (vectorize width 1, interleave count 2). The expected vector.body processes two consecutive indices per iteration and guards the loads from %b with pred.load.if blocks.
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 3
+; CHECK-NEXT:    [[DOTNOT8:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i32 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP4]], i32 2, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP5]], i32 2, i32 [[TMP7]]
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    br i1 [[DOTNOT8]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i32 [[TMP12]], 3
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw i32 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[DOTNOT]], i32 3, i32 [[TMP18]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[DOTNOT8]], i32 3, i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i32 [[TMP24]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP25]], [[PREDPHI7]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP29]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3 ; NOTE(review): this input is already an icmp/br chain (for.body.switch, for.body.switch2), not a switch instruction - confirm that is intended
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops; ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:     vector.body:
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ] ; FP value carried across iterations; it is fed by %conv20 computed in L3
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+    i32 3, label %L3
+    i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body ; unlike the two loops above, this back edge has no !llvm.loop hints attached
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !2, !4, !6} ; loop hints: vectorize width 4, interleave count 1, vectorize enabled
+!1 = distinct !{!1, !3, !5, !6} ; loop hints: vectorize width 1, interleave count 2, vectorize enabled (attached to the loop in @switch_VF1_UF2)
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 1}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/SimplifyCFG/nomerge.ll
===================================================================
--- llvm/test/Transforms/SimplifyCFG/nomerge.ll
+++ llvm/test/Transforms/SimplifyCFG/nomerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -S | FileCheck %s
+; RUN: opt < %s -O1 -remove-switch-blocks=false -S | FileCheck %s
 
 ; The attribute nomerge prevents the 3 bar() calls from being sunk/hoisted into
 ; one inside a function. Check that there are still 3 tail calls.
Index: llvm/test/Transforms/SimplifyCFG/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/remove-switches.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -simplifycfg -switch-removal-threshold=4 -S | FileCheck %s
+; @unswitch: a switch whose three cases each branch to a different block. With -switch-removal-threshold=4 on the opt invocation above, the expected output is a chain of icmp eq / br pairs in for.body, for.body.switch and for.body.switch2, each with its own !prof weights; the body of the default block L1 is expected in for.body.switch4.
+define void @unswitch(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N){
+; CHECK-LABEL: @unswitch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L4]], label [[FOR_BODY_SWITCH:%.*]], !prof !0
+; CHECK:       for.body.switch:
+; CHECK-NEXT:    [[SWITCH1:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[SWITCH1]], label [[L2:%.*]], label [[FOR_BODY_SWITCH2:%.*]], !prof !1
+; CHECK:       for.body.switch2:
+; CHECK-NEXT:    [[SWITCH3:%.*]] = icmp eq i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[SWITCH3]], label [[L3:%.*]], label [[FOR_BODY_SWITCH4:%.*]], !prof !2
+; CHECK:       for.body.switch4:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[L2]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+    i32 4, label %L4
+    i32 2, label %L2
+    i32 3, label %L3
+  ], !prof !0
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7, align 4
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ %0, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This test should not replace the switch statement, as multiple cases have the same destination block (cases 4 and 3 both branch to L3).
+define dso_local void @switch2(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N) {
+; CHECK-LABEL: @switch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 4, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+    i32 4, label %L3
+    i32 2, label %L2
+    i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+; Branch-weight expectations for @unswitch: each expected two-way branch pairs the weight of the case it tests with the sum of the weights not yet tested, i.e. 5 vs 15+10+2, then 10 vs 15+2, then 2 vs 15.
+!0 = !{!"branch_weights", i32 15, i32 5, i32 10, i32 2} ; input weights in successor order: default (L1), case 4, case 2, case 3
+; CHECK: !0 = !{!"branch_weights", i32 5, i32 27}
+; CHECK: !1 = !{!"branch_weights", i32 10, i32 17}
+; CHECK: !2 = !{!"branch_weights", i32 2, i32 15}