Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -41,17 +41,17 @@
   VPBasicBlock *BB = nullptr;
   VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();

-  VPInstruction *createInstruction(unsigned Opcode,
+  VPInstruction *createInstruction(unsigned Opcode, Type *T,
                                    ArrayRef<VPValue *> Operands) {
-    VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+    VPInstruction *Instr = new VPInstruction(Opcode, T, Operands);
     if (BB)
       BB->insert(Instr, InsertPt);
     return Instr;
   }

-  VPInstruction *createInstruction(unsigned Opcode,
+  VPInstruction *createInstruction(unsigned Opcode, Type *T,
                                    std::initializer_list<VPValue *> Operands) {
-    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+    return createInstruction(Opcode, T, ArrayRef<VPValue *>(Operands));
   }

 public:
@@ -118,28 +118,28 @@
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
-  VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+  VPValue *createNaryOp(unsigned Opcode, Type *T, ArrayRef<VPValue *> Operands,
                         Instruction *Inst = nullptr) {
-    VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+    VPInstruction *NewVPInst = createInstruction(Opcode, T, Operands);
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
-  VPValue *createNaryOp(unsigned Opcode,
+  VPValue *createNaryOp(unsigned Opcode, Type *T,
                         std::initializer_list<VPValue *> Operands,
                         Instruction *Inst = nullptr) {
-    return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+    return createNaryOp(Opcode, T, ArrayRef<VPValue *>(Operands), Inst);
  }

-  VPValue *createNot(VPValue *Operand) {
-    return createInstruction(VPInstruction::Not, {Operand});
+  VPValue *createNot(VPValue *Operand, Type *T) {
+    return createInstruction(VPInstruction::Not, T, {Operand});
   }

-  VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
-    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS, Type *T) {
+    return createInstruction(Instruction::BinaryOps::And, T, {LHS, RHS});
   }

-  VPValue *createOr(VPValue *LHS, VPValue *RHS) {
-    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+  VPValue *createOr(VPValue *LHS, VPValue *RHS, Type *T) {
+    return createInstruction(Instruction::BinaryOps::Or, T, {LHS, RHS});
   }

   //===--------------------------------------------------------------------===//
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7317,10 +7317,11 @@
   assert(EdgeMask && "No Edge Mask found for condition");

   if (BI->getSuccessor(0) != Dst)
-    EdgeMask = Builder.createNot(EdgeMask);
+    EdgeMask = Builder.createNot(EdgeMask, Type::getInt1Ty(Src->getContext()));

   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
-    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+    EdgeMask = Builder.createAnd(EdgeMask, SrcMask,
+                                 Type::getInt1Ty(Src->getContext()));

   return EdgeMaskCache[Edge] = EdgeMask;
 }
@@ -7365,9 +7366,11 @@
     // as a second argument, we only pass the IV here and extract the
     // tripcount from the transform state where codegen of the VP instructions
     // happen.
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                     Type::getInt1Ty(BB->getContext()), {IV});
   } else {
-    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+    BlockMask = Builder.createNaryOp(
+        VPInstruction::ICmpULE, Type::getInt1Ty(BB->getContext()), {IV, BTC});
   }
   return BlockMaskCache[BB] = BlockMask;
 }
@@ -7383,7 +7386,8 @@
       continue;
     }

-    BlockMask = Builder.createOr(BlockMask, EdgeMask);
+    BlockMask = Builder.createOr(BlockMask, EdgeMask,
+                                 Type::getInt1Ty(BB->getContext()));
   }

   return BlockMaskCache[BB] = BlockMask;
@@ -7913,7 +7917,8 @@
       continue;
     VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
     VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
-    Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+    Builder.createNaryOp(Instruction::Select, Reduction.first->getType(),
+                         {Cond, Red, Phi});
   }
 }
@@ -7952,7 +7957,7 @@
     Plan->addVF(ElementCount::getFixed(VF));

   if (EnableVPlanPredication) {
-    VPlanPredicator VPP(*Plan);
+    VPlanPredicator VPP(*Plan, OrigLoop->getHeader()->getContext());
     VPP.predicate();

     // Avoid running transformation to recipes until masked code generation in
@@ -8438,9 +8443,34 @@
 }

 VectorizationCostTy VPInstruction::cost(ElementCount VF, VPCostContext &Ctx) {
-  // FIXME: Cost everything that a VPInstruction can be, which likely needs type
-  // information.
-  return {0, false};
+  if (Instruction::isBinaryOp(getOpcode()))
+    return {Ctx.CM.TTI.getArithmeticInstrCost(getOpcode(), getVectorType(VF)),
+            false};
+
+  switch (getOpcode()) {
+  case VPInstruction::Not:
+    // FIXME: For consistency with the old cost functions we mark this as zero
+    // cost.
+    return {0, false};
+  case VPInstruction::ICmpULE:
+    return {Ctx.CM.TTI.getCmpSelInstrCost(Instruction::ICmp, getVectorType(VF)),
+            false};
+  case Instruction::Select:
+    // FIXME: This is usually suspected to end up outside the loop (for tail
+    // folded reductions), or be freely folded into another instruction. That
+    // might not be true in all uses of the instruction in the future.
+    return {0, false};
+  case VPInstruction::ActiveLaneMask: {
+    ArrayRef<Type *> Ops;
+    IntrinsicCostAttributes CostAttrs(Intrinsic::get_active_lane_mask,
+                                      getVectorType(VF), Ops);
+    return {Ctx.CM.TTI.getIntrinsicInstrCost(
+                CostAttrs, TargetTransformInfo::TCK_RecipThroughput),
+            false};
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for VPInstruction::cost");
+  }
 }

 // Determine how to lower the scalar epilogue, which depends on 1) optimising
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -762,6 +762,8 @@
   typedef unsigned char OpcodeTy;
   OpcodeTy Opcode;

+  Type *ScalarType;
+
   /// Utility method serving execute(): generates a single instance of the
   /// modeled instruction.
   void generateInstruction(VPTransformState &State, unsigned Part);

@@ -770,12 +772,16 @@
   void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }

 public:
-  VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+  VPInstruction(unsigned Opcode, Type *T, ArrayRef<VPValue *> Operands)
       : VPUser(Operands), VPValue(VPValue::VPInstructionSC),
-        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode) {}
+        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode),
+        ScalarType(T) {}

-  VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
-      : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+  VPInstruction(unsigned Opcode, Type *T,
+                std::initializer_list<VPValue *> Operands)
+      : VPUser(Operands), VPValue(VPValue::VPInstructionSC),
+        VPRecipeBase(VPRecipeBase::VPInstructionSC), Opcode(Opcode),
+        ScalarType(T) {}

   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPValue *V) {
@@ -784,7 +790,7 @@

   VPInstruction *clone() const {
     SmallVector<VPValue *, 2> Operands(operands());
-    return new VPInstruction(Opcode, Operands);
+    return new VPInstruction(Opcode, ScalarType, Operands);
   }

   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -792,6 +798,11 @@
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
   }

+  Type *getScalarType() const { return ScalarType; }
+  Type *getVectorType(ElementCount VF) const {
+    return VectorType::get(ScalarType, VF);
+  }
+
   unsigned getOpcode() const { return Opcode; }

   /// Generate the instruction.
Index: llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -216,7 +216,7 @@
       // an empty VPInstruction that we will fix once the whole plain CFG has
       // been built.
       NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
-          Inst->getOpcode(), {} /*No operands*/, Inst));
+          Inst->getOpcode(), Inst->getType(), {} /*No operands*/, Inst));
       PhisToFix.push_back(Phi);
     } else {
       // Translate LLVM-IR operands into VPValue operands and set them in the
@@ -227,8 +227,8 @@

       // Build VPInstruction for any arbitraty Instruction without specific
       // representation in VPlan.
-      NewVPInst = cast<VPInstruction>(
-          VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+      NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+          Inst->getOpcode(), Inst->getType(), VPOperands, Inst));
     }

     IRDef2VPValue[Inst] = NewVPInst;
Index: llvm/lib/Transforms/Vectorize/VPlanPredicator.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanPredicator.h
+++ llvm/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -40,6 +40,9 @@
   // VPlan builder used to generate VPInstructions for block predicates.
   VPBuilder Builder;

+  // Current LLVM context for creating types.
+  LLVMContext &Ctx;
+
   /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
   /// \p ToBlock is either the unconditional successor or the conditional true
   /// successor of \p FromBlock and FALSE_EDGE otherwise.
@@ -65,7 +68,7 @@
   void linearizeRegionRec(VPRegionBlock *Region);

 public:
-  VPlanPredicator(VPlan &Plan);
+  VPlanPredicator(VPlan &Plan, LLVMContext &Ctx);

   /// Predicate Plan's HCFG.
   void predicate(void);
Index: llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -50,14 +50,14 @@
   case EdgeType::FALSE_EDGE:
     // CurrBB is the False successor of PredBB - compute not of CBV.
-    IntermediateVal = Builder.createNot(CBV);
+    IntermediateVal = Builder.createNot(CBV, Type::getInt1Ty(Ctx));
     break;
   }

   // Now AND intermediate value with PredBB's block predicate if it has one.
   VPValue *BP = PredBB->getPredicate();
   if (BP)
-    return Builder.createAnd(BP, IntermediateVal);
+    return Builder.createAnd(BP, IntermediateVal, Type::getInt1Ty(Ctx));
   else
     return IntermediateVal;
 }
@@ -96,7 +96,7 @@
     Worklist.pop_front();

     // Create an OR of these values.
-    VPValue *Or = Builder.createOr(LHS, RHS);
+    VPValue *Or = Builder.createOr(LHS, RHS, Type::getInt1Ty(Ctx));

     // Push OR to the back of the worklist.
     Worklist.push_back(Or);
@@ -239,8 +239,8 @@
   linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
 }

-VPlanPredicator::VPlanPredicator(VPlan &Plan)
-    : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
+VPlanPredicator::VPlanPredicator(VPlan &Plan, LLVMContext &Ctx)
+    : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())), Ctx(Ctx) {
   // FIXME: Predicator is currently computing the dominator information for the
   // top region. Once we start storing dominator information in a VPRegionBlock,
   // we can avoid this recalculation.
Index: llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -412,7 +412,7 @@
     LLVM_DEBUG(dbgs() << "  Adding multinode Ops\n");
     // Create dummy VPInstruction, which will we replace later by the
     // re-ordered operand.
-    VPInstruction *Op = new VPInstruction(0, {});
+    VPInstruction *Op = new VPInstruction(0, nullptr, {});
     CombinedOperands.push_back(Op);
     MultiNodeOps.emplace_back(Op, Operands);
   }
@@ -463,7 +463,8 @@
     return markFailed();

   assert(CombinedOperands.size() > 0 && "Need more some operands");

-  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  auto *VPI =
+      new VPInstruction(Opcode, nullptr, CombinedOperands); // FIXME Type info?
VPI->setUnderlyingInstr(cast(Values[0])->getUnderlyingInstr()); LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -61,7 +61,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -84,7 +84,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -176,7 +176,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -199,7 +199,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -291,7 +291,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -314,7 +314,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret 
void ; @@ -391,7 +391,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -414,7 +414,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -506,7 +506,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -529,7 +529,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -621,7 +621,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -644,7 +644,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -731,7 +731,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -755,7 +755,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -851,7 +851,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -875,7 +875,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -971,7 +971,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -995,7 +995,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1131,7 +1131,7 @@ ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1155,7 +1155,7 @@ ; AVX: 
for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX: for.end: ; AVX-NEXT: ret void ; @@ -1251,7 +1251,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1275,7 +1275,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1384,7 +1384,7 @@ ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,7 +1409,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !50 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP50:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1664,7 +1664,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1687,7 +1687,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1808,7 +1808,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x 
double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59 +; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP59:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1831,7 +1831,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !60 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP60:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1877,93 +1877,36 @@ ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; 
AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void 
@llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP4:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds double*, double** [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast double** [[TMP8]] to <4 x double*>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP9]], i32 8, <4 x i1> [[TMP7]], <4 x double*> undef) +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP10]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP12]], [[TMP7]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP13]]) +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -1973,14 +1916,14 @@ ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX1-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], 
align 1 +; AVX1-NEXT: [[TMP18:%.*]] = and i8 [[TMP17]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP18]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP67]], null +; AVX1-NEXT: [[TMP19:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP19]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -1989,7 +1932,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2087,7 +2030,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2113,7 +2056,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2211,7 +2154,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP61:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2237,7 +2180,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !62 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP62:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -2294,93 +2237,36 @@ ; 
AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 -; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 -; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 -; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], -; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer -; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer -; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer -; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], -; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], -; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], -; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0 -; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = 
call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 -; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 -; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) -; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], -; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], -; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], -; AVX1-NEXT: [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], -; AVX1-NEXT: [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]] -; AVX1-NEXT: [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]] -; AVX1-NEXT: [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]] -; AVX1-NEXT: [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]]) -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4 -; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]]) -; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8 -; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]]) -; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12 -; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) -; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; 
AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44 +; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP4:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP6]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 ()** [[TMP8]] to <4 x i32 ()*>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP9]], i32 8, <4 x i1> [[TMP7]], <4 x i32 ()*> undef) +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP10]], +; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP12]], [[TMP7]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP13]]) +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2390,14 +2276,14 @@ ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX1-NEXT: [[TMP66:%.*]] = and i8 [[TMP65]], 1 -; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0 +; AVX1-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX1-NEXT: [[TMP18:%.*]] = and i8 [[TMP17]], 1 +; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP18]], 0 ; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX1: land.lhs.true: ; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null +; AVX1-NEXT: [[TMP19:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX1-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP19]], null ; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -2406,7 +2292,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 
[[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2504,7 +2390,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2530,7 +2416,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2628,7 +2514,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !64 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP64:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2654,7 +2540,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !65 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP65:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: Index: llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp =================================================================== --- llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp +++ llvm/unittests/Transforms/Vectorize/VPlanPredicatorTest.cpp @@ -77,7 +77,7 @@ VPValue *CBV2 = OuterIf->getCondBit(); // Apply predication. - VPlanPredicator VPP(*Plan); + VPlanPredicator VPP(*Plan, F->getContext()); VPP.predicate(); VPBlockBase *InnerLoopLinSucc = InnerLoopH->getSingleSuccessor(); @@ -181,7 +181,7 @@ VPValue *InnerCBV = InnerIfCmpBlk->getCondBit(); // Apply predication. 
-  VPlanPredicator VPP(*Plan);
+  VPlanPredicator VPP(*Plan, F->getContext());
   VPP.predicate();

   VPInstruction *And =
Index: llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
===================================================================
--- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -26,9 +26,9 @@
   } while (0)

 TEST(VPInstructionTest, insertBefore) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -41,9 +41,9 @@
 }

 TEST(VPInstructionTest, eraseFromParent) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -61,9 +61,9 @@
 }

 TEST(VPInstructionTest, moveAfter) {
-  VPInstruction *I1 = new VPInstruction(0, {});
-  VPInstruction *I2 = new VPInstruction(1, {});
-  VPInstruction *I3 = new VPInstruction(2, {});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(1, nullptr, {});
+  VPInstruction *I3 = new VPInstruction(2, nullptr, {});

   VPBasicBlock VPBB1;
   VPBB1.appendRecipe(I1);
@@ -74,8 +74,8 @@

   CHECK_ITERATOR(VPBB1, I2, I1, I3);

-  VPInstruction *I4 = new VPInstruction(4, {});
-  VPInstruction *I5 = new VPInstruction(5, {});
+  VPInstruction *I4 = new VPInstruction(4, nullptr, {});
+  VPInstruction *I5 = new VPInstruction(5, nullptr, {});
   VPBasicBlock VPBB2;
   VPBB2.appendRecipe(I4);
   VPBB2.appendRecipe(I5);
@@ -90,7 +90,7 @@
 TEST(VPInstructionTest, setOperand) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});
   EXPECT_EQ(1u, VPV1->getNumUsers());
   EXPECT_EQ(I1, *VPV1->user_begin());
   EXPECT_EQ(1u, VPV2->getNumUsers());
@@ -136,7 +136,7 @@
 TEST(VPInstructionTest, replaceAllUsesWith) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});

   // Replace all uses of VPV1 with VPV3.
   VPValue *VPV3 = new VPValue();
@@ -167,7 +167,7 @@
   EXPECT_EQ(0u, VPV2->getNumUsers());
   EXPECT_EQ(0u, VPV3->getNumUsers());

-  VPInstruction *I2 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I2 = new VPInstruction(0, nullptr, {VPV1, VPV2});
   EXPECT_EQ(3u, VPV1->getNumUsers());
   VPV1->replaceAllUsesWith(VPV3);
   EXPECT_EQ(3u, VPV3->getNumUsers());
@@ -182,7 +182,7 @@
 TEST(VPInstructionTest, releaseOperandsAtDeletion) {
   VPValue *VPV1 = new VPValue();
   VPValue *VPV2 = new VPValue();
-  VPInstruction *I1 = new VPInstruction(0, {VPV1, VPV2});
+  VPInstruction *I1 = new VPInstruction(0, nullptr, {VPV1, VPV2});

   EXPECT_EQ(1u, VPV1->getNumUsers());
   EXPECT_EQ(I1, *VPV1->user_begin());
@@ -288,17 +288,17 @@
 }

 TEST(VPBasicBlockTest, print) {
-  VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
-  VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
-  VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
+  VPInstruction *I1 = new VPInstruction(Instruction::Add, nullptr, {});
+  VPInstruction *I2 = new VPInstruction(Instruction::Sub, nullptr, {I1});
+  VPInstruction *I3 = new VPInstruction(Instruction::Br, nullptr, {I1, I2});
   VPBasicBlock *VPBB1 = new VPBasicBlock();
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);

-  VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
-  VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
+  VPInstruction *I4 = new VPInstruction(Instruction::Mul, nullptr, {I2, I1});
+  VPInstruction *I5 = new VPInstruction(Instruction::Ret, nullptr, {I4});
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
@@ -360,7 +360,7 @@
 TEST(VPRecipeTest, CastVPInstructionToVPUser) {
   VPValue Op1;
   VPValue Op2;
-  VPInstruction Recipe(Instruction::Add, {&Op1, &Op2});
+  VPInstruction Recipe(Instruction::Add, nullptr, {&Op1, &Op2});
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
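
Reviewer note (not part of the patch): below is a minimal, hypothetical sketch of how the typed VPBuilder/VPInstruction API introduced above would be used. It assumes an LLVMContext Ctx and existing VPValue operands IV and BTC (names borrowed from the LoopVectorize.cpp hunks); the fixed VF of 4 and the local variable names are illustrative only.

  // Illustrative sketch; mirrors the signatures added in
  // LoopVectorizationPlanner.h and VPlan.h above.
  VPBuilder Builder;
  Type *I1Ty = Type::getInt1Ty(Ctx);          // each mask lane is a scalar i1
  // The compare now records its scalar result type explicitly.
  VPValue *Mask =
      Builder.createNaryOp(VPInstruction::ICmpULE, I1Ty, {IV, BTC});
  // Costing widens the recorded scalar type to the per-VF vector type.
  auto *MaskVPI = cast<VPInstruction>(Mask);
  Type *VecTy = MaskVPI->getVectorType(ElementCount::getFixed(4)); // <4 x i1>
  (void)VecTy;

This is also why VPlanPredicator now takes an LLVMContext: block predicates are i1 values with no underlying IR instruction, so the context is the only way to materialize their type.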