diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1835,6 +1835,10 @@
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
 
+  const bool EnableMemAccessVersioningOfLoop =
+      EnableMemAccessVersioning &&
+      !TheLoop->getHeader()->getParent()->hasOptSize();
+
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // Scan the BB and collect legal loads and stores. Also detect any
@@ -1890,7 +1894,7 @@
         NumLoads++;
         Loads.push_back(Ld);
         DepChecker->addAccess(Ld);
-        if (EnableMemAccessVersioning)
+        if (EnableMemAccessVersioningOfLoop)
           collectStridedAccess(Ld);
         continue;
       }
@@ -1914,7 +1918,7 @@
       NumStores++;
       Stores.push_back(St);
       DepChecker->addAccess(St);
-      if (EnableMemAccessVersioning)
+      if (EnableMemAccessVersioningOfLoop)
        collectStridedAccess(St);
     }
   } // Next instr.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4937,15 +4937,8 @@
     return true;
   }
 
-  // FIXME: Avoid specializing for stride==1 instead of bailing out.
-  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
-    return true;
-  }
+  assert(Legal->getLAI()->getSymbolicStrides().empty() &&
+         "Specializing for stride == 1 under -Os/-Oz");
 
   return false;
 }
@@ -7611,7 +7604,7 @@
                                       PGSOQueryType::IRPass);
   // 1) OptSize takes precedence over all other options, i.e. if this is set,
   // don't look at hints or options, and don't request a scalar epilogue.
-  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+  if (OptSize)
     return CM_ScalarEpilogueNotAllowedOptSize;
 
   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -218,38 +218,35 @@
 
 attributes #1 = { minsize }
 
-; We can't vectorize this one because we version for stride==1; even having TC
-; a multiple of VF.
+; We can vectorize this one by refraining from versioning for stride==1.
 define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
 ; CHECK-LABEL: @scev4stride1(
 ; CHECK-NEXT:  for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    ret void
 ;
 ; AUTOVF-LABEL: @scev4stride1(
 ; AUTOVF-NEXT:  for.body.preheader:
-; AUTOVF-NEXT:    br label [[FOR_BODY:%.*]]
-; AUTOVF:       for.body:
-; AUTOVF-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
-; AUTOVF-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
-; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
-; AUTOVF-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AUTOVF-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
-; AUTOVF-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
-; AUTOVF-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
-; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; AUTOVF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AUTOVF:       vector.ph:
+; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[K:%.*]], i32 0
+; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AUTOVF:       vector.body:
+; AUTOVF:       middle.block:
+; AUTOVF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
+; AUTOVF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; AUTOVF:       scalar.ph:
 ; AUTOVF:       for.end.loopexit:
 ; AUTOVF-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=CHECK,PREDFLAG
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -74,141 +74,56 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
 }
 
+; Marking the function as optsize turns tail folding on, as if the explicit
+; tail-folding flag were enabled.
 define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; DEFAULT-LABEL: @tail_folding_disabled(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
-; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; DEFAULT-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
-; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
-; DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
-; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
-; DEFAULT-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
-; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, <8 x i32>* [[TMP21]], align 4
-; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; DEFAULT-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP23]], align 4
-; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 16
-; DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP25]], align 4
-; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 24
-; DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4
-; DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
-; DEFAULT-NEXT:    [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD1]]
-; DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <8 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
-; DEFAULT-NEXT:    [[TMP31:%.*]] = add nsw <8 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
-; DEFAULT-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP37]], align 4
-; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP29]], <8 x i32>* [[TMP39]], align 4
-; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 16
-; DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP30]], <8 x i32>* [[TMP41]], align 4
-; DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 24
-; DEFAULT-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP31]], <8 x i32>* [[TMP43]], align 4
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 416
-; DEFAULT-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 430, 416
-; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 416, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.cond.cleanup:
-; DEFAULT-NEXT:    ret void
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; DEFAULT-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
-;
-; PREDFLAG-LABEL: @tail_folding_disabled(
-; PREDFLAG-NEXT:  entry:
-; PREDFLAG-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG:       vector.ph:
-; PREDFLAG-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDFLAG:       vector.body:
-; PREDFLAG-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
-; PREDFLAG-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
-; PREDFLAG-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; PREDFLAG-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PREDFLAG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
-; PREDFLAG-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; PREDFLAG-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
-; PREDFLAG-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; PREDFLAG-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
-; PREDFLAG-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; PREDFLAG-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
-; PREDFLAG-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PREDFLAG-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; PREDFLAG-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
-; PREDFLAG:       middle.block:
-; PREDFLAG-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; PREDFLAG:       scalar.ph:
-; PREDFLAG-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PREDFLAG-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDFLAG:       for.cond.cleanup:
-; PREDFLAG-NEXT:    ret void
-; PREDFLAG:       for.body:
-; PREDFLAG-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; PREDFLAG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
-; PREDFLAG-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; PREDFLAG-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; PREDFLAG-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; PREDFLAG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK-LABEL: @tail_folding_disabled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -154,6 +154,73 @@
   ret i32 %for
 }
 
+; PR46228: Vectorize w/o versioning for unit stride under optsize when
+; vectorization is enabled.
+
+; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py
+define void @stride1(i16* noalias %B, i32 %BStride) optsize {
+; CHECK-LABEL: @stride1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[BSTRIDE:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP4]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+; PGSO-LABEL: @stride1(
+; PGSO-NEXT:  entry:
+; PGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+;
+; NPGSO-LABEL: @stride1(
+; NPGSO-NEXT:  entry:
+; NPGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %mulB = mul nsw i32 %iv, %BStride
+  %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB
+  store i16 42, i16* %gepOfB, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, 1025
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15
+
+for.end:
+  ret void
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"ProfileSummary", !1}
 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
@@ -170,3 +237,5 @@
 !12 = !{i32 999000, i64 100, i32 1}
 !13 = !{i32 999999, i64 1, i32 2}
 !14 = !{!"function_entry_count", i64 0}
+!15 = distinct !{!15, !16}
+!16 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
--- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -26,13 +26,57 @@
   ret void
 }
 
-; Check that the need for stride==1 check prevents vectorizing a loop under opt
-; for size.
-; CHECK-LABEL: @scev4stride1
-; CHECK-NOT: vector.scevcheck
-; CHECK-NOT: vector.body:
-; CHECK-LABEL: for.body:
+; Check that a loop under opt-for-size is vectorized, w/o checking for
+; stride==1.
+; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py
 define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+; CHECK-LABEL: @scev4stride1(
+; CHECK-NEXT:  for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.body:
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    ret void
+;
 for.body.preheader:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -162,9 +162,9 @@
 
 define dso_local void @forced_optsize(i64* noalias nocapture readonly %x_p, i64* noalias nocapture readonly %y_p, i64* noalias nocapture %z_p) minsize optsize {
 ;
-; FORCED_OPTSIZE: remark: <unknown>:0:0: Code-size may be reduced by not forcing vectorization, or by source-code modifications eliminating the need for runtime checks (e.g., adding 'restrict').
+; FORCED_OPTSIZE: remark: <unknown>:0:0: loop not vectorized: runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os/-Oz
 ; FORCED_OPTSIZE-LABEL: @forced_optsize(
-; FORCED_OPTSIZE: vector.body:
+; FORCED_OPTSIZE-NOT: vector.body:
 ;
 entry:
   br label %for.body
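
Editor's note, as a hedged illustration (not part of the patch): the source-level pattern the @scev4stride1 tests exercise is, roughly, a loop whose load stride is known only at run time. In the C sketch below, the function name, trip count, and access pattern mirror the IR tests; everything else is an assumption.

/*
 * b[i * k] has a symbolic stride k. Before this patch, vectorizing under
 * -Os/-Oz required versioning the loop on a run-time k == 1 check, which is
 * forbidden at opt-for-size, so the loop stayed scalar. With the patch,
 * LoopAccessAnalysis no longer collects symbolic strides for optsize
 * functions, so the vectorizer keeps the stride symbolic and emits
 * scalarized/gather loads for b instead of specializing for k == 1.
 */
void scev4stride1(int *restrict a, const int *restrict b, int k) {
  for (int i = 0; i < 1024; ++i)
    a[i] = b[i * k];
}

The tail_loop_folding.ll change shows the other half of the same decision: with getScalarEpilogueLowering returning CM_ScalarEpilogueNotAllowedOptSize unconditionally for optsize functions, an optsize loop folds its tail using masked loads and stores rather than keeping a scalar epilogue, which is why the formerly distinct DEFAULT and PREDFLAG check prefixes collapse into a single CHECK prefix.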