Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -41,17 +41,17 @@
   VPBasicBlock *BB = nullptr;
   VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();

-  VPInstruction *createInstruction(unsigned Opcode,
+  VPInstruction *createInstruction(unsigned Opcode, Type *T,
                                    ArrayRef<VPValue *> Operands) {
-    VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+    VPInstruction *Instr = new VPInstruction(Opcode, T, Operands);
     if (BB)
       BB->insert(Instr, InsertPt);
     return Instr;
   }

-  VPInstruction *createInstruction(unsigned Opcode,
+  VPInstruction *createInstruction(unsigned Opcode, Type *T,
                                    std::initializer_list<VPValue *> Operands) {
-    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+    return createInstruction(Opcode, T, ArrayRef<VPValue *>(Operands));
   }

 public:
@@ -118,28 +118,28 @@
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
-  VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+  VPValue *createNaryOp(unsigned Opcode, Type *T, ArrayRef<VPValue *> Operands,
                         Instruction *Inst = nullptr) {
-    VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+    VPInstruction *NewVPInst = createInstruction(Opcode, T, Operands);
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
-  VPValue *createNaryOp(unsigned Opcode,
+  VPValue *createNaryOp(unsigned Opcode, Type *T,
                         std::initializer_list<VPValue *> Operands,
                         Instruction *Inst = nullptr) {
-    return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+    return createNaryOp(Opcode, T, ArrayRef<VPValue *>(Operands), Inst);
  }

-  VPValue *createNot(VPValue *Operand) {
-    return createInstruction(VPInstruction::Not, {Operand});
+  VPValue *createNot(VPValue *Operand, Type *T) {
+    return createInstruction(VPInstruction::Not, T, {Operand});
   }

-  VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
-    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS, Type *T) {
+    return createInstruction(Instruction::BinaryOps::And, T, {LHS, RHS});
   }

-  VPValue *createOr(VPValue *LHS, VPValue *RHS) {
-    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+  VPValue *createOr(VPValue *LHS, VPValue *RHS, Type *T) {
+    return createInstruction(Instruction::BinaryOps::Or, T, {LHS, RHS});
   }

   //===--------------------------------------------------------------------===//
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7317,10 +7317,11 @@
   assert(EdgeMask && "No Edge Mask found for condition");

   if (BI->getSuccessor(0) != Dst)
-    EdgeMask = Builder.createNot(EdgeMask);
+    EdgeMask = Builder.createNot(EdgeMask, Type::getInt1Ty(Src->getContext()));

   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
-    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+    EdgeMask = Builder.createAnd(EdgeMask, SrcMask,
+                                 Type::getInt1Ty(Src->getContext()));

   return EdgeMaskCache[Edge] = EdgeMask;
 }
@@ -7365,9 +7366,11 @@
     // as a second argument, we only pass the IV here and extract the
     // tripcount from the transform state where codegen of the VP instructions
     // happen.
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                     Type::getInt1Ty(BB->getContext()), {IV});
   } else {
-    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+    BlockMask = Builder.createNaryOp(
+        VPInstruction::ICmpULE, Type::getInt1Ty(BB->getContext()), {IV, BTC});
   }
   return BlockMaskCache[BB] = BlockMask;
 }
@@ -7383,7 +7386,8 @@
       continue;
     }

-    BlockMask = Builder.createOr(BlockMask, EdgeMask);
+    BlockMask = Builder.createOr(BlockMask, EdgeMask,
+                                 Type::getInt1Ty(BB->getContext()));
   }

   return BlockMaskCache[BB] = BlockMask;
@@ -7913,7 +7917,8 @@
       continue;
     VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
     VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
-    Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+    Builder.createNaryOp(Instruction::Select, Reduction.first->getType(),
+                         {Cond, Red, Phi});
   }
 }
@@ -7952,7 +7957,7 @@
     Plan->addVF(ElementCount::getFixed(VF));

   if (EnableVPlanPredication) {
-    VPlanPredicator VPP(*Plan);
+    VPlanPredicator VPP(*Plan, OrigLoop->getHeader()->getContext());
     VPP.predicate();

     // Avoid running transformation to recipes until masked code generation in
@@ -8438,9 +8443,34 @@
 }

 VectorizationCostTy VPInstruction::cost(ElementCount VF, VPCostContext &Ctx) {
-  // FIXME: Cost everything that a VPInstruction can be, which likely needs type
-  // information.
-  return {0, false};
+  if (Instruction::isBinaryOp(getOpcode()))
+    return {Ctx.CM.TTI.getArithmeticInstrCost(getOpcode(), getVectorType(VF)),
+            false};
+
+  switch (getOpcode()) {
+  case VPInstruction::Not:
+    // FIXME: For consistency with the old cost functions we mark this as zero
+    // cost.
+    return {0, false};
+  case VPInstruction::ICmpULE:
+    return {Ctx.CM.TTI.getCmpSelInstrCost(Instruction::ICmp, getVectorType(VF)),
+            false};
+  case Instruction::Select:
+    // FIXME: This is usually suspected to end up outside the loop (for tail
+    // folded reductions), or be freely folded into another instruction. That
+    // might not be true in all uses of the instruction in the future.
+    return {0, false};
+  case VPInstruction::ActiveLaneMask: {
+    ArrayRef<Type *> Ops;
+    IntrinsicCostAttributes CostAttrs(Intrinsic::get_active_lane_mask,
+                                      getVectorType(VF), Ops);
+    return {Ctx.CM.TTI.getIntrinsicInstrCost(
+                CostAttrs, TargetTransformInfo::TCK_RecipThroughput),
+            false};
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for VPInstruction::cost");
+  }
 }

 // Determine how to lower the scalar epilogue, which depends on 1) optimising
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -762,6 +762,8 @@
   typedef unsigned char OpcodeTy;
   OpcodeTy Opcode;

+  Type *ScalarType;
+
   /// Utility method serving execute(): generates a single instance of the
   /// modeled instruction.
   void generateInstruction(VPTransformState &State, unsigned Part);

@@ -770,12 +772,16 @@
   void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }

 public:
-  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+  VPInstruction(unsigned Opcode, Type *T, ArrayRef<VPValue *> Operands)
       : VPUser(Operands), VPValue(VPValue::VPInstructionSC),
-        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode),
+        ScalarType(T) {}

-  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
-      : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+  VPInstruction(unsigned Opcode, Type *T,
+                std::initializer_list<VPValue *> Operands)
+      : VPUser(Operands), VPValue(VPValue::VPInstructionSC),
+        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode),
+        ScalarType(T) {}

   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPValue *V) {
@@ -784,7 +790,7 @@

   VPInstruction *clone() const {
     SmallVector<VPValue *, 2> Operands(operands());
-    return new VPInstruction(Opcode, Operands);
+    return new VPInstruction(Opcode, ScalarType, Operands);
   }

   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -792,6 +798,11 @@
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
   }

+  Type *getScalarType() const { return ScalarType; }
+  Type *getVectorType(ElementCount VF) const {
+    return VectorType::get(ScalarType, VF);
+  }
+
   unsigned getOpcode() const { return Opcode; }

   /// Generate the instruction.
Index: llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -216,7 +216,7 @@
       // an empty VPInstruction that we will fix once the whole plain CFG has
       // been built.
       NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
-          Inst->getOpcode(), {} /*No operands*/, Inst));
+          Inst->getOpcode(), Inst->getType(), {} /*No operands*/, Inst));
       PhisToFix.push_back(Phi);
     } else {
       // Translate LLVM-IR operands into VPValue operands and set them in the
@@ -227,8 +227,8 @@

       // Build VPInstruction for any arbitraty Instruction without specific
       // representation in VPlan.
-      NewVPInst = cast<VPInstruction>(
-          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+      NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+          Inst->getOpcode(), Inst->getType(), VPOperands, Inst));
     }

     IRDef2VPValue[Inst] = NewVPInst;
Index: llvm/lib/Transforms/Vectorize/VPlanPredicator.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanPredicator.h
+++ llvm/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -40,6 +40,9 @@
   // VPlan builder used to generate VPInstructions for block predicates.
   VPBuilder Builder;

+  // Current LLVM context for creating types.
+  LLVMContext &Ctx;
+
   /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
   /// \p ToBlock is either the unconditional successor or the conditional true
   /// successor of \p FromBlock and FALSE_EDGE otherwise.
@@ -65,7 +68,7 @@
   void linearizeRegionRec(VPRegionBlock *Region);

 public:
-  VPlanPredicator(VPlan &Plan);
+  VPlanPredicator(VPlan &Plan, LLVMContext &Ctx);

   /// Predicate Plan's HCFG.
   void predicate(void);
Index: llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -50,14 +50,14 @@
   case EdgeType::FALSE_EDGE:
     // CurrBB is the False successor of PredBB - compute not of CBV.
-    IntermediateVal = Builder.createNot(CBV);
+    IntermediateVal = Builder.createNot(CBV, Type::getInt1Ty(Ctx));
     break;
   }

   // Now AND intermediate value with PredBB's block predicate if it has one.
   VPValue *BP = PredBB->getPredicate();
   if (BP)
-    return Builder.createAnd(BP, IntermediateVal);
+    return Builder.createAnd(BP, IntermediateVal, Type::getInt1Ty(Ctx));
   else
     return IntermediateVal;
 }
@@ -96,7 +96,7 @@
     Worklist.pop_front();

     // Create an OR of these values.
-    VPValue *Or = Builder.createOr(LHS, RHS);
+    VPValue *Or = Builder.createOr(LHS, RHS, Type::getInt1Ty(Ctx));

     // Push OR to the back of the worklist.
     Worklist.push_back(Or);
@@ -239,8 +239,8 @@
   linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
 }

-VPlanPredicator::VPlanPredicator(VPlan &Plan)
-    : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
+VPlanPredicator::VPlanPredicator(VPlan &Plan, LLVMContext &Ctx)
+    : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())), Ctx(Ctx) {
   // FIXME: Predicator is currently computing the dominator information for the
   // top region. Once we start storing dominator information in a VPRegionBlock,
   // we can avoid this recalculation.
Index: llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -412,7 +412,7 @@
     LLVM_DEBUG(dbgs() << "  Adding multinode Ops\n");
     // Create dummy VPInstruction, which will we replace later by the
     // re-ordered operand.
-    VPInstruction *Op = new VPInstruction(0, {});
+    VPInstruction *Op = new VPInstruction(0, nullptr, {});
     CombinedOperands.push_back(Op);
     MultiNodeOps.emplace_back(Op, Operands);
   }
@@ -463,7 +463,8 @@
     return markFailed();

   assert(CombinedOperands.size() > 0 && "Need more some operands");

-  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  auto *VPI =
+      new VPInstruction(Opcode, nullptr, CombinedOperands); // FIXME Type info?
VPI->setUnderlyingInstr(cast(Values[0])->getUnderlyingInstr()); LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -61,7 +61,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -84,7 +84,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -176,7 +176,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -199,7 +199,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -291,7 +291,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -314,7 +314,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret 
void ; @@ -391,7 +391,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -414,7 +414,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -506,7 +506,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -529,7 +529,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -621,7 +621,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -644,7 +644,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -731,7 +731,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -755,7 +755,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -851,7 +851,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -875,7 +875,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -971,7 +971,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -995,7 +995,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1131,7 +1131,7 @@ ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1155,7 +1155,7 @@ ; AVX: 
for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX: for.end: ; AVX-NEXT: ret void ; @@ -1251,7 +1251,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1275,7 +1275,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1384,7 +1384,7 @@ ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,7 +1409,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !50 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP50:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1664,7 +1664,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1687,7 +1687,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1808,7 +1808,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x 
double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59 +; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP59:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1831,7 +1831,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !60 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP60:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1877,93 +1877,36 @@ ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; 
AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void 
@llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP4:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds double*, double** [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast double** [[TMP8]] to <4 x double*>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP9]], i32 8, <4 x i1> [[TMP7]], <4 x double*> undef) +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP10]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP12]], [[TMP7]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP13]]) +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -1973,14 +1916,14 @@ ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX1-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], 
align 1 +; AVX1-NEXT: [[TMP18:%.*]] = and i8 [[TMP17]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP18]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null +; AVX1-NEXT: [[TMP19:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP19]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -1989,7 +1932,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2087,7 +2030,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2113,7 +2056,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2211,7 +2154,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP61:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2237,7 +2180,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !62 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP62:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -2294,93 +2237,36 @@ ; 
AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = 
call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; 
AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP4:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 ()** [[TMP8]] to <4 x i32 ()*>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP9]], i32 8, <4 x i1> [[TMP7]], <4 x i32 ()*> undef) +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP10]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP12]], [[TMP7]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP13]]) +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2390,14 +2276,14 @@ ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX1-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX1-NEXT: [[TMP18:%.*]] = and i8 [[TMP17]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP18]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null +; AVX1-NEXT: [[TMP19:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP19]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -2406,7 +2292,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 
[[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2504,7 +2390,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2530,7 +2416,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2628,7 +2514,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !64 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP64:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2654,7 +2540,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !65 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP65:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: Index: llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp =================================================================== --- llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp +++ llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp @@ -77,7 +77,7 @@ VPValue *CBV2 = OuterIf->getCondBit(); // Apply predication. - VPlanPredicator VPP(*Plan); + VPlanPredicator VPP(*Plan, F->getContext()); VPP.predicate(); VPBlockBase *InnerLoopLinSucc = InnerLoopH->getSingleSuccessor(); @@ -181,7 +181,7 @@ VPValue *InnerCBV = InnerIfCmpBlk->getCondBit(); // Apply predication. 
-  VPlanPredicator VPP(*Plan);
+  VPlanPredicator VPP(*Plan, F->getContext());
   VPP.predicate();

   VPInstruction *And =
Index: llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
===================================================================
--- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -26,9 +26,9 @@
   } while (0)

 TEST(VPInstructionTest, insertBefore) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -41,9 +41,9 @@
 }

 TEST(VPInstructionTest, eraseFromParent) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -61,9 +61,9 @@
 }

 TEST(VPInstructionTest, moveAfter) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -74,8 +74,8 @@

   CHECK_ITERATOR(VPBB1, I2, I1, I3);

-  VPInstruction *I4 = new VPInstruction(4, {});
-  VPInstruction *I5 = new VPInstruction(5, {});
+  VPInstruction *I4 = new VPInstruction(4, nullptr, {});
+  VPInstruction *I5 = new VPInstruction(5, nullptr, {});
   VPBasicBlock VPBB2;
   VPBB2.appendRecipe(I4);
   VPBB2.appendRecipe(I5);
@@ -90,7 +90,7 @@
 TEST(VPInstructionTest, setOperand) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});
   EXPECT_EQ(1u, VPV1->getNumUsers());
   EXPECT_EQ(I1, *VPV1->user_begin());
   EXPECT_EQ(1u, VPV2->getNumUsers());
@@ -136,7 +136,7 @@
 TEST(VPInstructionTest, replaceAllUsesWith) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});

   // Replace all uses of VPV1 with VPV3.
   VPValue *VPV3 = new VPValue();
@@ -167,7 +167,7 @@
   EXPECT_EQ(0u, VPV2->getNumUsers());
   EXPECT_EQ(0u, VPV3->getNumUsers());

-  VPInstruction *I2 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I2 = new VPInstruction(0, nullptr, {VPV1, VPV2});
   EXPECT_EQ(3u, VPV1->getNumUsers());
   VPV1->replaceAllUsesWith(VPV3);
   EXPECT_EQ(3u, VPV3->getNumUsers());
@@ -182,7 +182,7 @@
 TEST(VPInstructionTest, releaseOperandsAtDeletion) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});

   EXPECT_EQ(1u, VPV1->getNumUsers());
   EXPECT_EQ(I1, *VPV1->user_begin());
@@ -288,17 +288,17 @@
 }

 TEST(VPBasicBlockTest, print) {
-  VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
-  VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
-  VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
+  VPInstruction *I1 = new VPInstruction(Instruction::Add, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(Instruction::Sub, nullptr, {I1});
+  VPInstruction *I3 = new VPInstruction(Instruction::Br, nullptr, {I1, I2});
   VPBasicBlock *VPBB1 = new VPBasicBlock();
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);

-  VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
-  VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
+  VPInstruction *I4 = new VPInstruction(Instruction::Mul, nullptr, {I2, I1});
+  VPInstruction *I5 = new VPInstruction(Instruction::Ret, nullptr, {I4});
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
@@ -360,7 +360,7 @@
 TEST(VPRecipeTest, CastVPInstructionToVPUser) {
   VPValue Op1;
   VPValue Op2;
-  VPInstruction Recipe(Instruction::Add, {&Op1, &Op2});
+  VPInstruction Recipe(Instruction::Add, nullptr, {&Op1, &Op2});
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
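
Reviewer note (not part of the patch): below is a minimal, hypothetical sketch of how the typed VPBuilder/VPInstruction API introduced above would be used. It assumes an LLVMContext Ctx and existing VPValue operands IV and BTC (names borrowed from the LoopVectorize.cpp hunks); the fixed VF of 4 and the local variable names are illustrative only.

  // Illustrative sketch; mirrors the signatures added in
  // LoopVectorizationPlanner.h and VPlan.h above.
  VPBuilder Builder;
  Type *I1Ty = Type::getInt1Ty(Ctx);          // each mask lane is a scalar i1
  // The compare now records its scalar result type explicitly.
  VPValue *Mask =
      Builder.createNaryOp(VPInstruction::ICmpULE, I1Ty, {IV, BTC});
  // Costing widens the recorded scalar type to the per-VF vector type.
  auto *MaskVPI = cast<VPInstruction>(Mask);
  Type *VecTy = MaskVPI->getVectorType(ElementCount::getFixed(4)); // <4 x i1>
  (void)VecTy;

This is also why VPlanPredicator now takes an LLVMContext: block predicates are i1 values with no underlying IR instruction, so the context is the only way to materialize their type.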