Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -229,7 +229,16 @@
 
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking, and mark all respective loads/stores for masking.
-  bool prepareToFoldTailByMasking();
+  /// If \p ReportFailure is false, failures will not be reported as
+  /// vectorization failures.
+  bool prepareToFoldTailByMasking(bool ReportFailure = true);
+
+  /// Abandons tail folding by masking: clears the sets of masked operations
+  /// and conditional assumes.
+  void abandonTailFoldingByMasking() {
+    MaskedOp.clear();
+    ConditionalAssumes.clear();
+  }
 
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1230,7 +1230,7 @@
   return Result;
 }
 
-bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+bool LoopVectorizationLegality::prepareToFoldTailByMasking(bool ReportFailure) {
 
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
@@ -1249,10 +1249,17 @@
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
         continue;
-      reportVectorizationFailure(
-          "Cannot fold tail by masking, loop has an outside user for",
-          "Cannot fold tail by masking in the presence of live outs.",
-          "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+      if (ReportFailure) {
+        reportVectorizationFailure(
+            "Cannot fold tail by masking, loop has an outside user for",
+            "Cannot fold tail by masking in the presence of live outs.",
+            "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+      } else {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Cannot fold tail by masking, loop has an outside user for "
+            << *UI << "\n");
+      }
       return false;
     }
   }
@@ -1264,11 +1271,14 @@
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
-      reportVectorizationFailure(
-          "Cannot fold tail by masking as required",
-          "control flow cannot be substituted for a select",
-          "NoCFGForSelect", ORE, TheLoop,
-          BB->getTerminator());
+      if (ReportFailure) {
+        reportVectorizationFailure(
+            "Cannot fold tail by masking as required",
+            "control flow cannot be substituted for a select", "NoCFGForSelect",
+            ORE, TheLoop, BB->getTerminator());
+      } else {
+        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking\n");
+      }
       return false;
     }
   }
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4966,13 +4966,28 @@
     return None;
   }
 
+  // If a hint/switch to use tail-folding is found, check early whether folding
+  // the tail by masking is possible; otherwise fall back to a scalar epilogue.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+    LLVM_DEBUG(dbgs() << "LV: Vector predicate hint/switch found.\n");
+
+    if (Legal->prepareToFoldTailByMasking(/*ReportFailure=*/false))
+      FoldTailByMasking = true;
+    else {
+      LLVM_DEBUG(
+          dbgs() << "LV: Loop does not support tail-folding, ignoring "
+                    "hint/switch and falling back to a scalar epilogue.\n");
+      Legal->abandonTailFoldingByMasking();
+      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    }
+  }
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(TC);
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
-        dbgs() << "LV: vector predicate hint/switch found.\n"
-               << "LV: Not allowing scalar epilogue, creating predicated "
+        dbgs() << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
   case CM_ScalarEpilogueNotAllowedLowTripLoop:
@@ -5008,9 +5023,18 @@
   if (TC > 0 && TC % MaxVF == 0) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    // Abandon tail folding if it was prepared earlier, so we don't generate
+    // unneeded masked loads/stores for this loop.
+    if (FoldTailByMasking) {
+      Legal->abandonTailFoldingByMasking();
+      FoldTailByMasking = false;
+    }
     return MaxVF;
   }
 
+  if (FoldTailByMasking)
+    return MaxVF;
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=false < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=true < %s | FileCheck %s
+
+; This test should produce the same result (vectorized loop + scalar epilogue)
+; whether MVE tail-predication is enabled or disabled, because this loop's tail
+; cannot be folded by masking due to the outside user of %incdec.ptr13 in %end.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @outside_user_blocks_tail_folding(
+; CHECK-NEXT: header:
+; CHECK-NEXT: [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT: [[DEC62:%.*]] = add nsw i32 [[SIZE:%.*]], -1
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP1]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 1
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR0]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[N_VEC]]
+; CHECK-NEXT: [[IND_END4:%.*]] = sub i32 [[DEC62]], [[N_VEC]]
+; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[N_VEC]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[DEC62]], [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1, !alias.scope !0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP10]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR0]], [[HEADER:%.*]] ], [ [[PTR0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[DEC62]], [[HEADER]] ], [ [[DEC62]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i8* [ [[IND_END6]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: [[DOTPRE74:%.*]] = phi i8* [ [[INCDEC_PTR13:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[BUFF_06065:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF_06065]], i32 1
+; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: [[INCDEC_PTR13]] = getelementptr inbounds i8, i8* [[DOTPRE74]], i32 1
+; CHECK-NEXT: store i8 [[TMP12]], i8* [[DOTPRE74]], align 1
+; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !7
+; CHECK: end:
+; CHECK-NEXT: [[INCDEC_PTR13_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR13]], [[BODY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8* [[INCDEC_PTR13_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT: ret void
+;
+
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  %dec62 = add nsw i32 %size, -1
+  br label %body
+
+body:
+  %.pre74 = phi i8* [ %incdec.ptr13, %body ], [ %ptr0, %header ]
+  %dec66 = phi i32 [ %dec, %body ], [ %dec62, %header ]
+  %buff.06065 = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff.06065, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  %incdec.ptr13 = getelementptr inbounds i8, i8* %.pre74, i32 1
+  store i8 %0, i8* %.pre74, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr13, i8** %pos, align 4
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilog < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+; This test should produce the same result whether tail-predication is forced
+; or left disabled: the loop cannot be tail-predicated due to the outside user
+; of %incdec.ptr13 in %end, so the vectorizer falls back to a scalar epilogue.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @outside_user_blocks_tail_folding(
+; CHECK-NEXT: header:
+; CHECK-NEXT: [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT: [[DEC62:%.*]] = add nsw i32 [[SIZE:%.*]], -1
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP1]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 1
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR0]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[N_VEC]]
+; CHECK-NEXT: [[IND_END4:%.*]] = sub i32 [[DEC62]], [[N_VEC]]
+; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[N_VEC]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[DEC62]], [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1, !alias.scope !0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP10]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR0]], [[HEADER:%.*]] ], [ [[PTR0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[DEC62]], [[HEADER]] ], [ [[DEC62]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i8* [ [[IND_END6]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: [[DOTPRE74:%.*]] = phi i8* [ [[INCDEC_PTR13:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[BUFF_06065:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF_06065]], i32 1
+; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: [[INCDEC_PTR13]] = getelementptr inbounds i8, i8* [[DOTPRE74]], i32 1
+; CHECK-NEXT: store i8 [[TMP12]], i8* [[DOTPRE74]], align 1
+; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !7
+; CHECK: end:
+; CHECK-NEXT: [[INCDEC_PTR13_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR13]], [[BODY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8* [[INCDEC_PTR13_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT: ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  %dec62 = add nsw i32 %size, -1
+  br label %body
+
+body:
+  %.pre74 = phi i8* [ %incdec.ptr13, %body ], [ %ptr0, %header ]
+  %dec66 = phi i32 [ %dec, %body ], [ %dec62, %header ]
+  %buff.06065 = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff.06065, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  %incdec.ptr13 = getelementptr inbounds i8, i8* %.pre74, i32 1
+  store i8 %0, i8* %.pre74, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr13, i8** %pos, align 4
+  ret void
+}