Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1912,14 +1912,15 @@
   /// there is no vector code generation, the check blocks are removed
   /// completely.
   void Create(Loop *L, const LoopAccessInfo &LAI,
-              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
+              bool ForceVectorization) {
 
     // Hard cutoff to limit compile-time increase in case a very large number of
-    // runtime checks needs to be generated.
+    // runtime checks needs to be generated, unless the user forced vectorization.
     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
     // profile info.
-    CostTooHigh =
-        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+    CostTooHigh = !ForceVectorization && LAI.getNumRuntimePointerChecks() >
+                                             VectorizeMemoryCheckThreshold;
     if (CostTooHigh)
       return;
 
@@ -10333,14 +10334,15 @@
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
 
     unsigned SelectedIC = std::max(IC, UserIC);
+    bool ForceVectorization =
+        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     //  Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
     if (VF.Width.isVector() || SelectedIC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
+                    ForceVectorization);
 
     // Check if it is profitable to vectorize with runtime checks.
-    bool ForceVectorization =
-        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
                                     *PSE.getSE())) {
Index: llvm/test/Transforms/LoopVectorize/memcheck_threashold_vec_forced.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/memcheck_threashold_vec_forced.ll
+++ llvm/test/Transforms/LoopVectorize/memcheck_threashold_vec_forced.ll
@@ -9,38 +9,44 @@
 define void @vec_forced(i64 %n, ptr %a, ptr %b) {
 ; CHECK-LABEL: @vec_forced(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
+; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP8]], 1
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
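
Note (illustration, not part of the patch): the new ForceVectorization argument is true only when LoopVectorizeHints::getForce() returns FK_Enabled, i.e. when the loop carries forced-vectorization metadata, which is what the new test relies on. Below is a minimal sketch of an input loop that takes this path, assuming a plain `opt -passes=loop-vectorize -S` invocation; the function name and metadata numbering are illustrative, not taken from the test file:

  define void @forced_example(i64 %n, ptr %a, ptr %b) {
  entry:
    br label %for.body

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
    %0 = load i32, ptr %gep.b, align 4
    %add = add nsw i32 %0, 1
    %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv
    store i32 %add, ptr %gep.a, align 4
    %iv.next = add nuw nsw i64 %iv, 1
    %exitcond = icmp eq i64 %iv.next, %n
    ; The !llvm.loop metadata below is what makes Hints.getForce() return
    ; LoopVectorizeHints::FK_Enabled; Clang emits it for
    ; #pragma clang loop vectorize(enable).
    br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0

  exit:
    ret void
  }

  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.vectorize.enable", i1 true}

With that metadata present, Create() skips the VectorizeMemoryCheckThreshold cutoff (the -vectorize-memory-check-threshold cl::opt), so a forced loop still gets its runtime memory checks generated even when the number of runtime pointer checks exceeds the threshold.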