Index: clang/test/Frontend/optimization-remark-analysis.c
===================================================================
--- clang/test/Frontend/optimization-remark-analysis.c
+++ clang/test/Frontend/optimization-remark-analysis.c
@@ -1,5 +1,5 @@
-// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
-// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
+// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
+// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -remove-switch-blocks=false -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
 // RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
 // CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
Index: llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -29,6 +29,7 @@
   bool SinkCommonInsts = false;
   bool SimplifyCondBranch = true;
   bool FoldTwoEntryPHINode = true;
+  unsigned SwitchRemovalThreshold = 0;
 
   AssumptionCache *AC = nullptr;
 
@@ -70,6 +71,12 @@
     FoldTwoEntryPHINode = B;
     return *this;
   }
+
+  /// Set the largest case count for which SimplifyCFG removes a switch.
+  SimplifyCFGOptions &switchRemovalThreshold(unsigned I) {
+    SwitchRemovalThreshold = I;
+    return *this;
+  }
 };
 
 } // namespace llvm
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -256,6 +256,17 @@
     cl::desc("Run synthetic function entry count generation "
              "pass"));
 
+static cl::opt<bool>
+    RemoveSwitchBlocks("remove-switch-blocks", cl::init(true), cl::Hidden,
+                       cl::desc("Convert switch blocks into a branch sequence "
+                                "prior to vectorization."));
+
+// Switches with at most this many cases are removed before the vectorizer
+// pass; larger switches are left alone. Removing switch blocks and replacing
+// them with compares and branches allows architectures that support
+// predication to vectorize.
+static const unsigned RemoveSwitchCaseThreshold = 4;
+
 static const Regex DefaultAliasRegex(
     "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
 
@@ -1201,6 +1212,10 @@
 /// TODO: Should LTO cause any differences to this set of passes?
 void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                   FunctionPassManager &FPM, bool IsFullLTO) {
+  if (RemoveSwitchBlocks)
+    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().switchRemovalThreshold(
+        RemoveSwitchCaseThreshold)));
+
   FPM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
Index: llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -55,6 +55,11 @@
     "bonus-inst-threshold", cl::Hidden, cl::init(1),
     cl::desc("Control the number of bonus instructions (default = 1)"));
 
+static cl::opt<unsigned> UserSwitchRemovalThreshold(
+    "switch-removal-threshold", cl::Hidden, cl::init(0),
+    cl::desc("Set the threshold for the number of switch cases where we "
+             "convert switch blocks to branches and compares"));
+
 static cl::opt<bool> UserKeepLoops(
     "keep-loops", cl::Hidden, cl::init(true),
     cl::desc("Preserve canonical loop structure (default = true)"));
@@ -308,6 +313,8 @@
     Options.HoistCommonInsts = UserHoistCommonInsts;
   if (UserSinkCommonInsts.getNumOccurrences())
     Options.SinkCommonInsts = UserSinkCommonInsts;
+  if (UserSwitchRemovalThreshold.getNumOccurrences())
+    Options.SwitchRemovalThreshold = UserSwitchRemovalThreshold;
 }
 
 SimplifyCFGPass::SimplifyCFGPass() : Options() {
Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6141,6 +6141,102 @@
   return true;
 }
 
+// Retarget PHI edges in Dest from OldPred to NewPred. The entry is looked up
+// by index so the PHI is never modified while its blocks are iterated.
+static void RemapSwitchPHIs(BasicBlock *Dest, BasicBlock *OldPred,
+                            BasicBlock *NewPred) {
+  for (PHINode &PN : Dest->phis()) {
+    int Idx = PN.getBasicBlockIndex(OldPred);
+    if (Idx < 0)
+      continue;
+    Value *V = PN.getIncomingValue(unsigned(Idx));
+    PN.removeIncomingValue(unsigned(Idx), /*DeletePHIIfEmpty=*/false);
+    PN.addIncoming(V, NewPred);
+  }
+}
+
+// Attempt to turn a switch statement into a series of conditional branches
+// which we may later be able to vectorize. Returns false (leaving the IR
+// untouched) if the unique-destination check below fails, true otherwise.
+// NOTE(review): no DomTreeUpdater is passed in, so a dominator tree kept by
+// the caller is not updated for the new blocks and edges -- confirm.
+static bool TurnSmallSwitchIntoICmps(SwitchInst *SI, IRBuilder<> &Builder) {
+  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+  // Check to see if we have a genuine default, reachable block with executable
+  // instructions in them.
+  bool HasDefault =
+      !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
+  BasicBlock *DefaultBlock = HasDefault ? SI->getDefaultDest() : nullptr;
+  BasicBlock *BB = SI->getParent();
+
+  // Make sure each of the cases has a unique destination
+  for (auto Case : SI->cases())
+    if (!SI->findCaseDest(Case.getCaseSuccessor()))
+      return false;
+
+  // Record the total weighting for this switch block.
+  uint64_t TotalWeight = 0;
+  SmallVector<uint64_t, 8> Weights;
+  if (HasBranchWeights(SI)) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == (SI->getNumCases() + 1))
+      for (auto W : Weights)
+        TotalWeight += W;
+  }
+
+  BasicBlock *FalseDest = nullptr;
+  uint64_t FalseWeight = TotalWeight;
+  for (auto CI : SI->cases()) {
+    BasicBlock *TrueDest = CI.getCaseSuccessor();
+    Value *Cmp =
+        Builder.CreateICmpEQ(SI->getCondition(), CI.getCaseValue(), "switch");
+
+    // From the second case on the branch is emitted into the previous new
+    // block, so TrueDest PHI edges from the switch block are retargeted.
+    if (FalseDest)
+      RemapSwitchPHIs(TrueDest, BB, FalseDest);
+
+    BasicBlock *MoveAfter = FalseDest ? FalseDest : BB;
+    FalseDest = BasicBlock::Create(BB->getContext(), BB->getName() + ".switch",
+                                 BB->getParent(), BB);
+    FalseDest->moveAfter(MoveAfter);
+
+    Instruction *I = Builder.CreateCondBr(Cmp, TrueDest, FalseDest);
+    // Update weight for the newly-created conditional branch.
+    // We set the weight of the TrueDest to the weight for the successor
+    // of the current case. The FalseDest is assigned the remaining total
+    // weight, minus the weight assigned to TrueDest.
+    if (TotalWeight) {
+      unsigned Index = CI.getSuccessorIndex();
+      FalseWeight -= Weights[Index];
+      // The remaining weight is a 64-bit sum; scale the pair down so that
+      // neither value is truncated when passed on as a branch weight.
+      SmallVector<uint64_t, 2> Scaled = {Weights[Index], FalseWeight};
+      FitWeights(Scaled);
+      setBranchWeights(I, Scaled[0], Scaled[1]);
+    }
+    Builder.SetInsertPoint(FalseDest);
+  }
+
+  if (DefaultBlock) {
+    Builder.CreateBr(DefaultBlock);
+
+    // The default block may have PHIs fed from the switch block; it is now
+    // reached from the last new block instead, so fix those edges up.
+    RemapSwitchPHIs(DefaultBlock, BB, FalseDest);
+  } else
+    Builder.CreateUnreachable();
+
+  // Drop the switch.
+  SI->eraseFromParent();
+
+  Builder.SetInsertPoint(BB);
+
+  return true;
+}
+
 bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   BasicBlock *BB = SI->getParent();
 
@@ -6163,8 +6259,14 @@
         return requestResimplify();
   }
 
+  // Expand switches with at most SwitchRemovalThreshold cases into branches.
+  bool RemoveSwitches = Options.SwitchRemovalThreshold >= SI->getNumCases();
+
+  if (RemoveSwitches && TurnSmallSwitchIntoICmps(SI, Builder))
+    return requestResimplify();
+
   // Try to transform the switch into an icmp and a branch.
-  if (TurnSwitchRangeIntoICmp(SI, Builder))
+  if (!RemoveSwitches && TurnSwitchRangeIntoICmp(SI, Builder))
     return requestResimplify();
 
   // Remove unreachable cases.
@@ -6412,16 +6514,18 @@
       if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
         return requestResimplify();
 
-    // This block must be empty, except for the setcond inst, if it exists.
-    // Ignore dbg and pseudo intrinsics.
-    auto I = BB->instructionsWithoutDebug(true).begin();
-    if (&*I == BI) {
-      if (FoldValueComparisonIntoPredecessors(BI, Builder))
-        return requestResimplify();
-    } else if (&*I == cast<Instruction>(BI->getCondition())) {
-      ++I;
-      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
-        return requestResimplify();
+    if (Options.SwitchRemovalThreshold == 0) {
+      // This block must be empty, except for the setcond inst, if it exists.
+      // Ignore dbg and pseudo intrinsics.
+      auto I = BB->instructionsWithoutDebug(true).begin();
+      if (&*I == BI) {
+        if (FoldValueComparisonIntoPredecessors(BI, Builder))
+          return requestResimplify();
+      } else if (&*I == cast<Instruction>(BI->getCondition())) {
+        ++I;
+        if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+          return requestResimplify();
+      }
     }
   }
 
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -216,6 +216,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/Other/new-pm-lto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-lto-defaults.ll
+++ llvm/test/Other/new-pm-lto-defaults.ll
@@ -105,6 +105,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: LoopDeletionPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopFullUnrollPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopDistributePass on foo
+; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
Index: llvm/test/Other/new-pm-thinlto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -197,6 +197,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -168,6 +168,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -180,6 +180,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve -scalable-vectorization=on -S | FileCheck %s
+
+define void @switch(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP10]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD6]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[TMP21]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <vscale x 4 x i1> [[TMP19]], [[TMP15]]
+; CHECK-NEXT:    [[TMP24:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or <vscale x 4 x i1> [[TMP24]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[TMP28]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP30]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[PREDPHI11:%.*]] = select <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]]
+; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> [[TMP29]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI11]], [[PREDPHI11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <vscale x 4 x i32> [[TMP31]], [[PREDPHI12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP32]], <vscale x 4 x i32>* [[TMP33]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], {{.*}}
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ {{.*}}, %for.body.preheader ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP35]], label [[FOR_BODY_SWITCH5:%.*]] [
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> <i32 2, i32 2>, <2 x i32> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP16]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <2 x i32> [[TMP17]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> <i32 3, i32 3>, <2 x i32> [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nsw <2 x i32> [[TMP22]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <2 x i32> [[TMP23]], [[TMP19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP24]], <2 x i32>* [[TMP25]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, %for.body.preheader ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP27]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops; ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias %b, i32* noalias %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NOT:   vector.body
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}
+!2 = !{!"llvm.loop.interleave.count", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/remove-switches.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS
+
+; The loop below must stay scalar: without masked loads and stores the cost model rejects vectorization.
+; CHECK-REMARKS: remark: <unknown>:0:0: the cost-model indicates that vectorization is not beneficial
+define void @switch_cost(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) {
+; CHECK-LABEL: @switch_cost(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:   vector.body
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH5:%.*]] [
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    ]
+; Input IR: the switch on a[i] enters a fall-through chain L1 -> L2 -> L3 -> L4 in which every block stores to a[i].
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch(i32* noalias %a, i32* noalias %b, i64 %N) {
+; CHECK-LABEL: @switch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[DOTOP:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[DOTOP]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER4]]
+; CHECK:       for.body.preheader4:
+; CHECK-NEXT:    [[I_015_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I_015_PH]], [[FOR_BODY_PREHEADER4]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-NEXT:    [[SWITCH1:%.*]] = icmp eq i32 [[TMP10]], 2
+; CHECK-NEXT:    [[R_0_OP:%.*]] = select i1 [[SWITCH1]], i32 9, i32 16
+; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[SWITCH]], i32 7, i32 [[R_0_OP]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[ADD4]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; Input IR: the switch only picks a constant through the phis in L2/L3 (default 12, case 2 -> 5, case 3 -> 3) before 4 is added.
+entry:
+  %cmp14 = icmp sgt i64 %N, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %L3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %L3
+  %i.015 = phi i64 [ %inc, %L3 ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.015
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:                                               ; preds = %for.body
+  br label %L2
+
+L2:                                               ; preds = %for.body, %L1
+  %r.0 = phi i32 [ 12, %L1 ], [ 5, %for.body ]
+  br label %L3
+
+L3:                                               ; preds = %for.body, %L2
+  %r.1 = phi i32 [ %r.0, %L2 ], [ 3, %for.body ]
+  %add4 = add nuw nsw i32 %r.1, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i.015
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %add4
+  store i32 %mul, i32* %arrayidx5
+  %inc = add nuw nsw i64 %i.015, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 3
+; CHECK-NEXT:    [[DOTNOT8:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i32 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP4]], i32 2, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP5]], i32 2, i32 [[TMP7]]
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    br i1 [[DOTNOT8]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i32 [[TMP12]], 3
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw i32 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i32 [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw i32 [[TMP17]], [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[DOTNOT]], i32 3, i32 [[TMP18]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[DOTNOT8]], i32 3, i32 [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = shl nsw i32 [[TMP22]], 2
+; CHECK-NEXT:    [[TMP25:%.*]] = shl nsw i32 [[TMP23]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i32 [[TMP24]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP25]], [[PREDPHI7]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ {{.*}}, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP29]], 3
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L3]], label [[FOR_BODY_SWITCH:%.*]]
+; Input IR: the dispatch is already a compare-and-branch chain (no switch instruction); the loop uses metadata !1.
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops; ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-NOT:   vector.body:
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    ]
+; Input IR: the float accumulator %sum.033 is carried around the loop and the back-edge has no !llvm.loop metadata.
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !2, !4, !6} ; loop ID on the back-edge branch of @switch
+!1 = distinct !{!1, !3, !5, !6} ; loop ID on the back-edge branch of @switch_VF1_UF2
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 1}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/SimplifyCFG/nomerge.ll
===================================================================
--- llvm/test/Transforms/SimplifyCFG/nomerge.ll
+++ llvm/test/Transforms/SimplifyCFG/nomerge.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -S | FileCheck %s
+; RUN: opt < %s -O1 -remove-switch-blocks=false -S | FileCheck %s
 
 ; The attribute nomerge prevents the 3 bar() calls from being sunk/hoisted into
 ; one inside a function. Check that there are still 3 tail calls.
Index: llvm/test/Transforms/SimplifyCFG/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/remove-switches.ll
@@ -0,0 +1,142 @@
+; RUN: opt < %s -simplifycfg -switch-removal-threshold=4 -S | FileCheck %s
+; The checks expect the switch in for.body to become a chain of icmp + conditional branches, each carrying !prof weights.
+define void @unswitch(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N){
+; CHECK-LABEL: @unswitch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SWITCH:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[SWITCH]], label [[L4]], label [[FOR_BODY_SWITCH:%.*]], !prof !0
+; CHECK:       for.body.switch:
+; CHECK-NEXT:    [[SWITCH1:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[SWITCH1]], label [[L2:%.*]], label [[FOR_BODY_SWITCH2:%.*]], !prof !1
+; CHECK:       for.body.switch2:
+; CHECK-NEXT:    [[SWITCH3:%.*]] = icmp eq i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[SWITCH3]], label [[L3:%.*]], label [[FOR_BODY_SWITCH4:%.*]], !prof !2
+; CHECK:       for.body.switch4:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[L2]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ], !prof !0
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7, align 4
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ %0, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This test should not replace the switch statement, as multiple cases (4 and 3) have the same destination block (L3).
+define dso_local void @switch2(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N) {
+; CHECK-LABEL: @switch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 4, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L3
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 15, i32 5, i32 10, i32 2} ; weights in order: default, case 4, case 2, case 3
+; CHECK: !0 = !{!"branch_weights", i32 5, i32 27}
+; CHECK: !1 = !{!"branch_weights", i32 10, i32 17}
+; CHECK: !2 = !{!"branch_weights", i32 2, i32 15}