Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5574,9 +5574,12 @@ // consider interleaving beneficial (eg. MVE). if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // FIXME: We should consider changing the threshold for scalable - // vectors to take VScaleForTuning into account. - if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) + + unsigned Multiplier = 1; + if (VF.isScalable()) + if (std::optional VScale = getVScaleForTuning()) + Multiplier = *VScale; + if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } Index: llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,5 +1,7 @@ -; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s -; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \ +; RUN: -enable-epilogue-vectorization=false -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \ +; RUN: -enable-epilogue-vectorization=false -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -13,70 +13,106 @@ ; CHECK-NEXT: [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64 ; CHECK-NEXT: [[D1:%.*]] = ptrtoint ptr [[D:%.*]] to i64 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP6]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[D1]], [[S2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP7]] +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP7]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP9]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i64 [[TMP12]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = fneg [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP15:%.*]] = fneg [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i32 0 -; CHECK-NEXT: store [[TMP14]], ptr [[TMP17]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i64 [[TMP19]] -; CHECK-NEXT: store [[TMP15]], ptr [[TMP20]], align 2 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP15]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = fneg [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP17:%.*]] = fneg [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0 +; CHECK-NEXT: store [[TMP16]], ptr [[TMP19]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i64 [[TMP21]] +; CHECK-NEXT: store [[TMP17]], ptr [[TMP22]], align 2 +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP29]] +; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF5]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds half, ptr [[TMP31]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP32]], align 2 +; CHECK-NEXT: [[TMP33:%.*]] = fneg [[WIDE_LOAD9]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds half, ptr [[TMP34]], i32 0 +; CHECK-NEXT: store [[TMP33]], ptr [[TMP35]], align 2 +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 4 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX8]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[TMP38]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[FNEG:%.*]] = fneg half [[TMP24]] +; CHECK-NEXT: [[TMP39:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[FNEG:%.*]] = fneg half [[TMP39]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store half [[FNEG]], ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; entry: %cmp6 = icmp sgt i32 %n, 0 Index: llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -19,44 +19,77 @@ ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 8 For instruction: %conv = zext i8 %0 to i32 ; CHECK-LABEL: define void @zext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP8]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP17]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP2]], [[TMP19]] +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = zext [[WIDE_LOAD6]] to +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP21]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX5]] +; CHECK-NEXT: store [[TMP22]], ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP27]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]] @@ -64,7 +97,7 @@ ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -102,44 +135,77 @@ ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 8 For instruction: %conv = sext i8 %0 to i32 ; CHECK-LABEL: define void @sext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP8]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP17]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP2]], [[TMP19]] +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP21]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX5]] +; CHECK-NEXT: store [[TMP22]], ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 4 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP27]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2 ; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]] @@ -147,7 +213,7 @@ ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[LEN]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ;