This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorizer] When estimating register usage, unused instructions may "end" another use.
ClosedPublic

Authored by rob.lougher on Nov 11 2016, 12:24 PM.

Details

Summary

Consider the following simple test-case:

unsigned foo(unsigned *a, unsigned *b, int n) {
  unsigned i, s1 = 0, s2 = 0;
  for(i = 0; i < n; i++) {
    s1 += a[i];
    s2 += b[i];
  }
  return s2 - s1;
}

If this is compiled with -g as follows:

$ clang -mavx -O2 -S -g -emit-llvm test.c

The loop vectorizer will vectorize the loop with a vectorization factor of 4 and unroll it 2 times. In the following IR we can see there are 4 vector add instructions, and the index is updated by 8 (4 * 2):

vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ], !dbg !34
  %vec.phi = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi18 = phi <4 x i32> [ %12, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi19 = phi <4 x i32> [ %5, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi20 = phi <4 x i32> [ %6, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %1 = getelementptr inbounds i32, i32* %a, i64 %index, !dbg !32
  %2 = bitcast i32* %1 to <4 x i32>*, !dbg !32
  %wide.load = load <4 x i32>, <4 x i32>* %2, align 4, !dbg !32, !tbaa !36
  %3 = getelementptr i32, i32* %1, i64 4, !dbg !32
  %4 = bitcast i32* %3 to <4 x i32>*, !dbg !32
  %wide.load21 = load <4 x i32>, <4 x i32>* %4, align 4, !dbg !32, !tbaa !36
  %5 = add <4 x i32> %wide.load, %vec.phi19, !dbg !40
  %6 = add <4 x i32> %wide.load21, %vec.phi20, !dbg !40
  %7 = getelementptr inbounds i32, i32* %b, i64 %index, !dbg !41
  %8 = bitcast i32* %7 to <4 x i32>*, !dbg !41
  %wide.load22 = load <4 x i32>, <4 x i32>* %8, align 4, !dbg !41, !tbaa !36
  %9 = getelementptr i32, i32* %7, i64 4, !dbg !41
  %10 = bitcast i32* %9 to <4 x i32>*, !dbg !41
  %wide.load23 = load <4 x i32>, <4 x i32>* %10, align 4, !dbg !41, !tbaa !36
  %11 = add <4 x i32> %wide.load22, %vec.phi, !dbg !42
  %12 = add <4 x i32> %wide.load23, %vec.phi18, !dbg !42
  %index.next = add i64 %index, 8, !dbg !34
  %13 = icmp eq i64 %index.next, %n.vec, !dbg !34
  br i1 %13, label %middle.block, label %vector.body, !dbg !34, !llvm.loop !43

However, if it is compiled without -g, the loop vectorizer will unroll the loop 4 times:

vector.body:                                      ; preds = %vector.body.preheader, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ]
  %vec.phi = phi <4 x i32> [ %21, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi20 = phi <4 x i32> [ %22, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi21 = phi <4 x i32> [ %23, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi22 = phi <4 x i32> [ %24, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi23 = phi <4 x i32> [ %9, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi24 = phi <4 x i32> [ %10, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi25 = phi <4 x i32> [ %11, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %vec.phi26 = phi <4 x i32> [ %12, %vector.body ], [ zeroinitializer, %vector.body.preheader ]
  %1 = getelementptr inbounds i32, i32* %a, i64 %index
  %2 = bitcast i32* %1 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %2, align 4, !tbaa !1
  %3 = getelementptr i32, i32* %1, i64 4
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load27 = load <4 x i32>, <4 x i32>* %4, align 4, !tbaa !1
  %5 = getelementptr i32, i32* %1, i64 8
  %6 = bitcast i32* %5 to <4 x i32>*
  %wide.load28 = load <4 x i32>, <4 x i32>* %6, align 4, !tbaa !1
  %7 = getelementptr i32, i32* %1, i64 12
  %8 = bitcast i32* %7 to <4 x i32>*
  %wide.load29 = load <4 x i32>, <4 x i32>* %8, align 4, !tbaa !1
  %9 = add <4 x i32> %wide.load, %vec.phi23
  %10 = add <4 x i32> %wide.load27, %vec.phi24
  %11 = add <4 x i32> %wide.load28, %vec.phi25
  %12 = add <4 x i32> %wide.load29, %vec.phi26
  %13 = getelementptr inbounds i32, i32* %b, i64 %index
  %14 = bitcast i32* %13 to <4 x i32>*
  %wide.load30 = load <4 x i32>, <4 x i32>* %14, align 4, !tbaa !1
  %15 = getelementptr i32, i32* %13, i64 4
  %16 = bitcast i32* %15 to <4 x i32>*
  %wide.load31 = load <4 x i32>, <4 x i32>* %16, align 4, !tbaa !1
  %17 = getelementptr i32, i32* %13, i64 8
  %18 = bitcast i32* %17 to <4 x i32>*
  %wide.load32 = load <4 x i32>, <4 x i32>* %18, align 4, !tbaa !1
  %19 = getelementptr i32, i32* %13, i64 12
  %20 = bitcast i32* %19 to <4 x i32>*
  %wide.load33 = load <4 x i32>, <4 x i32>* %20, align 4, !tbaa !1
  %21 = add <4 x i32> %wide.load30, %vec.phi
  %22 = add <4 x i32> %wide.load31, %vec.phi20
  %23 = add <4 x i32> %wide.load32, %vec.phi21
  %24 = add <4 x i32> %wide.load33, %vec.phi22
  %index.next = add i64 %index, 16
  %25 = icmp eq i64 %index.next, %n.vec
  br i1 %25, label %middle.block, label %vector.body, !llvm.loop !5

If we look at the loop vectorizer's debug output we can see that the difference is caused by the register usage estimation. This estimate affects the "interleave count", which becomes the loop unroll factor.

Without -g:

...
LV: The target has 16 registers
LV(REG): Calculating max register usage:
LV(REG): At #0 Interval # 0
LV(REG): At #1 Interval # 1
LV(REG): At #2 Interval # 2
LV(REG): At #3 Interval # 3
LV(REG): At #4 Interval # 4
LV(REG): At #5 Interval # 4
LV(REG): At #6 Interval # 3
LV(REG): At #7 Interval # 4
LV(REG): At #8 Interval # 4
LV(REG): At #9 Interval # 3
LV(REG): At #10 Interval # 3
LV(REG): VF = 4
LV(REG): Found max usage: 3
LV(REG): Found invariant usage: 1
LV(REG): LoopSize: 12
LV: Interleaving because of reductions.
LV: Found a vectorizable loop (4) in test.c
LV: Interleave Count is 4

With -g:

...
LV: The target has 16 registers
LV(REG): Calculating max register usage:
LV(REG): At #0 Interval # 0
LV(REG): At #1 Interval # 1
LV(REG): At #2 Interval # 2
LV(REG): At #3 Interval # 3
LV(REG): At #4 Interval # 4
LV(REG): At #5 Interval # 4
LV(REG): At #7 Interval # 5
LV(REG): At #8 Interval # 6
LV(REG): At #9 Interval # 6
LV(REG): At #11 Interval # 7
LV(REG): At #14 Interval # 8
LV(REG): VF = 4
LV(REG): Found max usage: 6
LV(REG): Found invariant usage: 1
LV(REG): LoopSize: 16
LV: Interleaving because of reductions.
LV: Found a vectorizable loop (4) in test.c:3:3
LV: Interleave Count is 2
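
As a cross-check, the interleave count can be derived from these numbers. The heuristic computes, roughly, the largest power of two no greater than (registers - invariant usage) / max usage (a simplified reading of the cost model, not the exact code):

  Without -g: (16 - 1) / 3 = 5  ->  rounded down to a power of two: 4
  With -g:    (16 - 1) / 6 = 2  ->  2

This matches the interleave counts of 4 and 2 reported above.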

The cause is a bug in how the register usage algorithm treats instructions whose value is not used within the loop (e.g. instructions that do not produce a value).

The algorithm first calculates the usages within the loop. It iterates over the instructions in order and records, for each value, the instruction index at which its last use occurs (in fact, the use is recorded against the next index, as that is the point at which we want to delete it from the open intervals).

The algorithm then iterates over the instructions a second time, adding each instruction in turn to a list of open intervals. Before an instruction is added, any values whose recorded end index matches the current index are removed from the open intervals.

The problem is that instructions whose value is not used in the loop are skipped. However, even though such an instruction is itself unused, the last use of another value may have been recorded against its index. In that case the use is never deleted from the open intervals, which can inflate the estimated register usage.
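
To make this concrete, below is a minimal standalone sketch of this second pass as it stood before the patch (hypothetical names and simplified types; the real code is the register usage calculation in LoopVectorize.cpp, which additionally tracks loop-invariant values and per-VF register costs):

#include <algorithm>
#include <cstddef>
#include <map>
#include <set>
#include <vector>

using Instr = int; // stand-in for llvm::Instruction *

// Ends: instructions whose value is used within the loop.
// TransposeEnds[i]: values whose last use was recorded against index i.
std::size_t maxRegUsage(const std::vector<Instr> &Instrs,
                        const std::set<Instr> &Ends,
                        std::map<std::size_t, std::vector<Instr>> &TransposeEnds) {
  std::set<Instr> OpenIntervals;
  std::size_t MaxUsage = 0;
  for (std::size_t i = 0; i < Instrs.size(); ++i) {
    // BUG: if Instrs[i] is itself unused (e.g. a void debug intrinsic),
    // we skip the erase loop below, so any intervals recorded as ending
    // at index i are never closed.
    if (!Ends.count(Instrs[i]))
      continue;

    // Close every interval whose last use was recorded against index i.
    for (Instr V : TransposeEnds[i])
      OpenIntervals.erase(V);

    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
    OpenIntervals.insert(Instrs[i]);
  }
  return MaxUsage;
}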

The IR for the loop with -g:

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  %s2.015 = phi i32 [ %add3, %for.body ], [ 0, %for.body.preheader ]
  %s1.014 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !32
  %0 = load i32, i32* %arrayidx, align 4, !dbg !32, !tbaa !34
  %add = add i32 %0, %s1.014, !dbg !38
  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !17, metadata !19), !dbg !23
  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv, !dbg !39
  %1 = load i32, i32* %arrayidx2, align 4, !dbg !39, !tbaa !34
  %add3 = add i32 %1, %s2.015, !dbg !40
  tail call void @llvm.dbg.value(metadata i32 %add3, i64 0, metadata !18, metadata !19), !dbg !24
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !41
  tail call void @llvm.dbg.value(metadata i32 %add3, i64 0, metadata !18, metadata !19), !dbg !24
  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !17, metadata !19), !dbg !23
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !26
  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !30, !llvm.loop !43

We can see that we have 4 void calls to debug intrinsics. The first two are each preceded by an add instruction at which two values are last used (%0, %s1.014 and %1, %s2.015). We can also see this in the LV debug output: without -g the open interval count drops from 4 to 3 between index #5 (the first add) and index #6, whereas with -g it rises from 4 to 5 (the intrinsic at index #6 is skipped, so the next line printed is #7).

This patch fixes the issue by moving the "is used" check after the loop that erases the uses ending at the current index.
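
In terms of the sketch above, the change is just this reordering:

// Before (buggy): unused instructions skip the erase loop.
if (!Ends.count(Instrs[i]))
  continue;
for (Instr V : TransposeEnds[i])
  OpenIntervals.erase(V);

// After (fixed): close intervals first, then skip unused instructions.
for (Instr V : TransposeEnds[i])
  OpenIntervals.erase(V);
if (!Ends.count(Instrs[i]))
  continue;

With the erase loop made unconditional, the intervals for %0 and %s1.014 are closed when we reach the first debug intrinsic's index, even though the intrinsic itself is still skipped.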

Note that one of the register usage tests had to be updated. That test contains a store instruction, which is skipped as it does not produce a value; it is preceded by an add containing the last use of %0.

OK to commit?

Thanks,
Rob.

P.S. The difference with and without -g was found using the check_cfc tool (review D8723).

Diff Detail

Repository
rL LLVM

Event Timeline

rob.lougher retitled this revision to [LoopVectorizer] When estimating register usage, unused instructions may "end" another use.
rob.lougher updated this object.
mcrosier resigned from this revision. Nov 11 2016, 12:59 PM
mcrosier edited reviewers, added: mkuper, mssimpso; removed: mcrosier.
mkuper accepted this revision. Nov 13 2016, 3:52 PM
mkuper edited edge metadata.

Thanks, LGTM.

This revision is now accepted and ready to land. Nov 13 2016, 3:52 PM
This revision was automatically updated to reflect the committed changes.