Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -229,6 +229,7 @@ /// Return true if we can vectorize this loop while folding its tail by /// masking, and mark all respective loads/stores for masking. + /// This object's state is only modified iff this function returns true. bool prepareToFoldTailByMasking(); /// Returns the primary induction variable. @@ -369,8 +370,14 @@ /// its original trip-count, under a proper guard, which should be preserved. /// \p SafePtrs is a list of addresses that are known to be legal and we know /// that we can read from them without segfault. + /// \p MaskedOp is a list of instructions that have to be transformed into + /// calls to the appropriate masked intrinsic when the loop is vectorized. + /// \p ConditionalAssumes is a list of assume instructions in predicated + /// blocks that must be dropped if the CFG gets flattened. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs, - bool PreserveGuards = false); + SmallPtrSetImpl &MaskedOp, + SmallPtrSetImpl &ConditionalAssumes, + bool PreserveGuards = false) const; /// Updates the vectorization state by adding \p Phi to the inductions list. /// This can set \p Phi as the main induction of the loop if \p Phi is a Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -912,7 +912,10 @@ } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl &SafePtrs, bool PreserveGuards) { + BasicBlock *BB, SmallPtrSetImpl &SafePtrs, + SmallPtrSetImpl &MaskedOp, + SmallPtrSetImpl &ConditionalAssumes, + bool PreserveGuards) const { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -1019,7 +1022,8 @@ // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, MaskedOp, + ConditionalAssumes)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", @@ -1246,10 +1250,10 @@ Instruction *UI = cast(U); if (TheLoop->contains(UI)) continue; - reportVectorizationFailure( - "Cannot fold tail by masking, loop has an outside user for", - "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); + LLVM_DEBUG( + dbgs() + << "LV: Cannot fold tail by masking, loop has an outside user for " + << *UI << "\n"); return false; } } @@ -1257,20 +1261,26 @@ // The list of pointers that we can safely read and write to remains empty. SmallPtrSet SafePointers; + SmallPtrSet TmpMaskedOp; + SmallPtrSet TmpConditionalAssumes; + // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { - reportVectorizationFailure( - "Cannot fold tail by masking as required", - "control flow cannot be substituted for a select", - "NoCFGForSelect", ORE, TheLoop, - BB->getTerminator()); + if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, + TmpConditionalAssumes, + /* MaskAllLoads= */ true)) { + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } } LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); + + MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); + ConditionalAssumes.insert(TmpConditionalAssumes.begin(), + TmpConditionalAssumes.end()); + return true; } Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -178,13 +178,36 @@ "value are vectorized only if no scalar iteration overheads " "are incurred.")); -// Indicates that an epilogue is undesired, predication is preferred. -// This means that the vectorizer will try to fold the loop-tail (epilogue) -// into the loop and predicate the loop body accordingly. -static cl::opt PreferPredicateOverEpilog( - "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, - cl::desc("Indicate that an epilogue is undesired, predication should be " - "used instead.")); +// Option prefer-predicate-over-epilog indicates that an epilogue is undesired, +// that predication is preferred, and this lists all options. I.e., the +// vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. If tail-folding fails, there are +// different fallback strategies depending on these values: +namespace PreferPredicateTy { + enum Option { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize + }; +} + +static cl::opt PreferPredicateOverEpilogue( + "prefer-predicate-over-epilog", + cl::init(PreferPredicateTy::ScalarEpilogue), + cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, + "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -196,7 +219,7 @@ cl::desc("Enable vectorization on interleaved memory accesses in a loop")); /// An interleave-group may need masking if it resides in a block that needs -/// predication, or in order to mask away gaps. +/// predication, or in order to mask away gaps. static cl::opt EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); @@ -5076,6 +5099,19 @@ return MaxVF; } + // If there was a tail-folding hint/switch, but we can't fold the tail by + // masking, fallback to a vectorization with a scalar epilogue. + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { + LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); + return None; + } + LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " + "scalar epilogue instead.\n"); + ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + return MaxVF; + } + if (TC == 0) { reportVectorizationFailure( "Unable to calculate the loop count due to complex control flow", @@ -7663,8 +7699,8 @@ if (OptSize) return CM_ScalarEpilogueNotAllowedOptSize; - bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && - !PreferPredicateOverEpilog; + bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && + !PreferPredicateOverEpilogue; // 2) Next, if disabling predication is requested on the command line, honour // this and request a scalar epilogue. @@ -7673,8 +7709,8 @@ // 3) and 4) look if enabling predication is requested on the command line, // with a loop hint, or if the TTI hook indicates this is profitable, request - // predication . - if (PreferPredicateOverEpilog || + // predication. + if (PreferPredicateOverEpilogue || Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LVL.getLAI()) && Index: llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll +++ llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll @@ -31,13 +31,13 @@ ; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ -; RUN: -prefer-predicate-over-epilog=false \ +; RUN: -prefer-predicate-over-epilog=scalar-epilogue \ ; RUN: -tail-predication=enabled -loop-vectorize \ ; RUN: -enable-arm-maskedldst=true -S < %s | \ ; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ -; RUN: -prefer-predicate-over-epilog=true \ +; RUN: -prefer-predicate-over-epilog=predicate-dont-vectorize \ ; RUN: -tail-predication=enabled -loop-vectorize \ ; RUN: -enable-arm-maskedldst=true -S < %s | \ ; RUN: FileCheck %s -check-prefixes=CHECK,FOLDING-OPT Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT -; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER +; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER +; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER ; RUN: opt < %s -loop-vectorize -tail-predication=enabled -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll @@ -1,8 +1,9 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \ ; RUN: FileCheck %s -check-prefix=CHECK -; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog -S < %s | \ -; RUN: FileCheck -check-prefix=PREDFLAG %s +; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled \ +; RUN: -prefer-predicate-over-epilog=predicate-dont-vectorize -S < %s | \ +; RUN: FileCheck -check-prefix=PREDFLAG %s ; This test has a loop hint "predicate.predicate" set to false, so shouldn't ; get tail-folded, except with -prefer-predicate-over-epilog which then Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -tail-predication=disabled< %s | FileCheck %s +; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -tail-predication=enabled < %s | FileCheck %s + +; This test should produce the same result (vectorized loop + scalar epilogue) with +; default options and when MVE Tail Predication is enabled, as this loop's tail cannot be folded +; by masking due to an outside user of %incdec.ptr in %end. + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-unknown-eabihf" + +define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %size, i8** %pos) { +; CHECK-LABEL: @outside_user_blocks_tail_folding( +; CHECK-NEXT: header: +; CHECK-NEXT: [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] +; CHECK-NEXT: br label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1 +; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 +; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1 +; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2 +; CHECK: end: +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4 +; CHECK-NEXT: ret void +; +header: + %ptr0 = load i8*, i8** %pos, align 4 + br label %body + +body: + %dec66 = phi i32 [ %dec, %body ], [ %size, %header ] + %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ] + %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1 + %dec = add nsw i32 %dec66, -1 + %0 = load i8, i8* %incdec.ptr, align 1 + store i8 %0, i8* %buff, align 1 + %tobool11 = icmp eq i32 %dec, 0 + br i1 %tobool11, label %end, label %body + +end: + store i8* %incdec.ptr, i8** %pos, align 4 + ret void +} Index: llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog=predicate-dont-vectorize -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll +++ llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilog -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilog=predicate-dont-vectorize -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" Index: llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-vectorize -force-vector-width=2 -S -prefer-predicate-over-epilog %s | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -S -prefer-predicate-over-epilog=predicate-dont-vectorize %s | FileCheck %s ; Test case for PR46525. There are two candidates to pick for Index: llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog=predicate-dont-vectorize -force-vector-width=4 -S | FileCheck %s ; Check that a counting-down loop which has no primary induction variable ; is vectorized with preferred predication. Index: llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilog=predicate-else-scalar-epilogue < %s | FileCheck %s +; RUN: opt -S -loop-vectorize < %s | FileCheck %s + +; This tests should produce the same result as with default options, and when tail folding +; is preferred, because the vectorizer can't fold the tail by masking (due to an +; outside user of %incdec.ptr in %end) and should fallback to a scalar epilogue. +; +; The first test (@basic_loop) simply relies on the command-line switches. +; The second test (@metadata) specificies its tail-folding preference via metadata. +; Both tests should always generate a scalar epilogue. + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define void @basic_loop(i8* nocapture readonly %ptr, i32 %size, i8** %pos) { +; CHECK-LABEL: @basic_loop( +; CHECK-NEXT: header: +; CHECK-NEXT: [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] +; CHECK-NEXT: br label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1 +; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 +; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1 +; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2 +; CHECK: end: +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4 +; CHECK-NEXT: ret void +; +header: + %ptr0 = load i8*, i8** %pos, align 4 + br label %body + +body: + %dec66 = phi i32 [ %dec, %body ], [ %size, %header ] + %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ] + %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1 + %dec = add nsw i32 %dec66, -1 + %0 = load i8, i8* %incdec.ptr, align 1 + store i8 %0, i8* %buff, align 1 + %tobool11 = icmp eq i32 %dec, 0 + br i1 %tobool11, label %end, label %body + +end: + store i8* %incdec.ptr, i8** %pos, align 4 + ret void +} + +define void @metadata(i8* nocapture readonly %ptr, i32 %size, i8** %pos) { +; CHECK-LABEL: @metadata( +; CHECK-NEXT: header: +; CHECK-NEXT: [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ] +; CHECK-NEXT: br label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1 +; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1 +; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1 +; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !5 +; CHECK: end: +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4 +; CHECK-NEXT: ret void +; +header: + %ptr0 = load i8*, i8** %pos, align 4 + br label %body + +body: + %dec66 = phi i32 [ %dec, %body ], [ %size, %header ] + %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ] + %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1 + %dec = add nsw i32 %dec66, -1 + %0 = load i8, i8* %incdec.ptr, align 1 + store i8 %0, i8* %buff, align 1 + %tobool11 = icmp eq i32 %dec, 0 + br i1 %tobool11, label %end, label %body, !llvm.loop !1 + +end: + store i8* %incdec.ptr, i8** %pos, align 4 + ret void +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!3 = !{!"llvm.loop.vectorize.enable", i1 true}