Page MenuHomePhabricator

Bad SLPVectorization shufflevector replacement, resulting in write to wrong memory location
Needs ReviewPublic

Authored by vtjnash on Jul 22 2021, 4:48 PM.

Details

Summary

I am not yet sure how to test it, since my example case (in the commit comment) was broken by the cost model change in 0d74fd3fdf50. Revert that to see this bad code transform I described there in brief to happen. Suggestions would be helpful!

Diff Detail

Unit TestsFailed

TimeTest
2,730 msx64 debian > libarcher.barrier::barrier.c
Script: -- : 'RUN: at line 14'; /var/lib/buildkite-agent/builds/llvm-project/build/./bin/clang -fopenmp -pthread -fno-experimental-isel -g -O1 -fsanitize=thread -I /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests -I /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/runtime/src -L /var/lib/buildkite-agent/builds/llvm-project/build/lib -Wl,-rpath,/var/lib/buildkite-agent/builds/llvm-project/build/lib /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/barrier/barrier.c -o /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/barrier/Output/barrier.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/barrier/Output/barrier.c.tmp 2>&1 | tee /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/barrier/Output/barrier.c.tmp.log | /var/lib/buildkite-agent/builds/llvm-project/build/./bin/FileCheck /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/barrier/barrier.c
2,520 msx64 debian > libarcher.critical::critical.c
Script: -- : 'RUN: at line 15'; /var/lib/buildkite-agent/builds/llvm-project/build/./bin/clang -fopenmp -pthread -fno-experimental-isel -g -O1 -fsanitize=thread -I /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests -I /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/runtime/src -L /var/lib/buildkite-agent/builds/llvm-project/build/lib -Wl,-rpath,/var/lib/buildkite-agent/builds/llvm-project/build/lib /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/critical/critical.c -o /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/critical/Output/critical.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/critical/Output/critical.c.tmp 2>&1 | tee /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/critical/Output/critical.c.tmp.log | /var/lib/buildkite-agent/builds/llvm-project/build/./bin/FileCheck /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/critical/critical.c
2,500 msx64 debian > libarcher.parallel::parallel-simple2.c
Script: -- : 'RUN: at line 15'; /var/lib/buildkite-agent/builds/llvm-project/build/./bin/clang -fopenmp -pthread -fno-experimental-isel -g -O1 -fsanitize=thread -I /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests -I /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/runtime/src -L /var/lib/buildkite-agent/builds/llvm-project/build/lib -Wl,-rpath,/var/lib/buildkite-agent/builds/llvm-project/build/lib /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/parallel/parallel-simple2.c -o /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/parallel/Output/parallel-simple2.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/parallel/Output/parallel-simple2.c.tmp 2>&1 | tee /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/parallel/Output/parallel-simple2.c.tmp.log | /var/lib/buildkite-agent/builds/llvm-project/build/./bin/FileCheck /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/parallel/parallel-simple2.c
2,670 msx64 debian > libarcher.races::critical-unrelated.c
Script: -- : 'RUN: at line 13'; /var/lib/buildkite-agent/builds/llvm-project/build/./bin/clang -fopenmp -pthread -fno-experimental-isel -g -O1 -fsanitize=thread -I /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests -I /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/runtime/src -L /var/lib/buildkite-agent/builds/llvm-project/build/lib -Wl,-rpath,/var/lib/buildkite-agent/builds/llvm-project/build/lib /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/races/critical-unrelated.c -o /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/critical-unrelated.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/deflake.bash /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/critical-unrelated.c.tmp 2>&1 | tee /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/critical-unrelated.c.tmp.log | /var/lib/buildkite-agent/builds/llvm-project/build/./bin/FileCheck /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/races/critical-unrelated.c
2,560 msx64 debian > libarcher.races::lock-nested-unrelated.c
Script: -- : 'RUN: at line 13'; /var/lib/buildkite-agent/builds/llvm-project/build/./bin/clang -fopenmp -pthread -fno-experimental-isel -g -O1 -fsanitize=thread -I /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests -I /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/runtime/src -L /var/lib/buildkite-agent/builds/llvm-project/build/lib -Wl,-rpath,/var/lib/buildkite-agent/builds/llvm-project/build/lib /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/races/lock-nested-unrelated.c -o /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/lock-nested-unrelated.c.tmp -latomic && env TSAN_OPTIONS='ignore_noninstrumented_modules=0:ignore_noninstrumented_modules=1' /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/deflake.bash /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/lock-nested-unrelated.c.tmp 2>&1 | tee /var/lib/buildkite-agent/builds/llvm-project/build/projects/openmp/tools/archer/tests/races/Output/lock-nested-unrelated.c.tmp.log | /var/lib/buildkite-agent/builds/llvm-project/build/./bin/FileCheck /var/lib/buildkite-agent/builds/llvm-project/openmp/tools/archer/tests/races/lock-nested-unrelated.c
View Full Test Results (18 Failed)

Event Timeline

vtjnash created this revision.Jul 22 2021, 4:48 PM
vtjnash requested review of this revision.Jul 22 2021, 4:48 PM
Herald added a project: Restricted Project. · View Herald TranscriptJul 22 2021, 4:48 PM
vtjnash retitled this revision from Bad SLPVectorization shufflevector replacement to Bad SLPVectorization shufflevector replacement, resulting in write to wrong memory location.Jul 22 2021, 4:56 PM
vtjnash edited the summary of this revision. (Show Details)
vtjnash added reviewers: ABataev, spatel, vdmitrie.

I am not yet sure how to test it, since my example case (in the commit comment) was broken by the cost model change in 0d74fd3fdf50.

Can you actually link to it? I can't find it.

Revert that to see this bad code transform I described there in brief to happen. Suggestions would be helpful!

What does bad code transform mean?

vchuravy added a project: Restricted Project.Jul 23 2021, 2:53 AM
vchuravy added a subscriber: vchuravy.
ABataev added inline comments.Jul 23 2021, 7:40 AM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

This is definitely wrong, the third arg must be the lane id in the current entry node, not in the user node. But (not sure, looks like) it points to a problem with scatter vectorize pointers, which were vectorized previously. Will check it later.

vtjnash added a subscriber: Restricted Project.Jul 23 2021, 10:16 AM
vtjnash added inline comments.Jul 23 2021, 10:29 AM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

I think that makes sense. I was hoping this would break a test, so I could see where this fix attempt was wrong, but apparently no tests currently depended upon that. This was a confusing example, where it vectorized the load with a scatter-gather (offsets 10 and 4), but later vectorized the store using lane 1 of that combined gep (desiring to extract the pointer with gep offsets 4 and 6), but the 0 here means that store got the gep with offset 10 (and 12) back instead when it tried to get PO back out of the scatter-gather for insertion into that VecPtr user.

ABataev added inline comments.Jul 23 2021, 10:36 AM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

I think here we need an extra check for ScatterVectorize nodes. For such nodes, we need to extract all used pointers, not only the first one. Going to try to look at it later.

vtjnash added inline comments.Aug 19 2021, 11:30 AM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

Bump? If this was fixed, could you let me know the commit #, so I could test with it. Thanks!

ABataev added inline comments.Aug 19 2021, 11:33 AM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

Not yet, busy with some other patches, hope to investigate it soon.

ABataev added inline comments.Fri, Aug 27, 12:07 PM
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
5421

Hi, could you send your reproducer? Even if it is not triggered anymore?

The full reproducer is in the commit message on phabricator for this review, though I don't know how to link to the text of a commit specifically. Here's the text of that:

With opt -mcpu=haswell -slp-vectorizer, we see that it might currently produce:

%10 = getelementptr {}**, <2 x {}***> %9, <2 x i32> <i32 10, i32 4>
%11 = bitcast <2 x {}***> %10 to <2 x i64*>
...
%27 = extractelement <2 x i64*> %11, i32 0
%28 = bitcast i64* %27 to <2 x i64>*
store <2 x i64> %22, <2 x i64>* %28, align 4, !tbaa !2

Which is an out-of-bounds store (the extractelement got offset 10
instead of offset 4 as intended). With the fix, we correctly generate
extractelement for i32 1 and generate correct code.

; ModuleID = 'rand3.ll'
source_filename = "rand"
target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128-ni:10:11:12:13"
target triple = "i686-unknown-linux-gnu"

@llvm.compiler.used = appending global [3 x i8*] [i8* bitcast (void ({} addrspace(10)*)* @jl_gc_queue_root to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32, i32)* @jl_gc_pool_alloc to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32)* @jl_gc_big_alloc to i8*)], section "llvm.metadata"

; Function Attrs: sspstrong
define void @julia_rand_5(i64* noalias nocapture sret(i64) %0) #0 {
top:
%1 = call {}*** @julia.get_pgcstack()
%2 = getelementptr {}**, {}*** %1, i32 4
%3 = bitcast {}*** %2 to i64*
%4 = load i64, i64* %3, align 4, !tbaa !2
%5 = getelementptr {}**, {}*** %1, i32 6
%6 = bitcast {}*** %5 to i64*
%7 = load i64, i64* %6, align 4, !tbaa !2
%8 = getelementptr {}**, {}*** %1, i32 8
%9 = bitcast {}*** %8 to i64*
%10 = load i64, i64* %9, align 4, !tbaa !2
%11 = getelementptr {}**, {}*** %1, i32 10
%12 = bitcast {}*** %11 to i64*
%13 = load i64, i64* %12, align 4, !tbaa !2
%14 = add i64 %13, %4
%15 = call i64 @llvm.fshl.i64(i64 %14, i64 %14, i64 23)
%16 = shl i64 %7, 17
%17 = xor i64 %10, %4
%18 = xor i64 %13, %7
%19 = xor i64 %17, %7
%20 = xor i64 %18, %4
%21 = xor i64 %17, %16
%22 = call i64 @llvm.fshl.i64(i64 %18, i64 %18, i64 45)
store i64 %20, i64* %3, align 4, !tbaa !2
store i64 %19, i64* %6, align 4, !tbaa !2
store i64 %21, i64* %9, align 4, !tbaa !2
store i64 %22, i64* %12, align 4, !tbaa !2
store i64 %15, i64* %0, align 4
ret void
}

define nonnull {} addrspace(10)* @jfptr_rand_6({} addrspace(10)* %0, {} addrspace(10)** %1, i32 %2) #1 {
top:
%3 = call {}*** @julia.get_pgcstack()
%4 = alloca i64, align 8
call void @julia_rand_5(i64* noalias nocapture nonnull sret(i64) %4) #5
%5 = load i64, i64* %4, align 8, !tbaa !7
%6 = call nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext %5)
ret {} addrspace(10)* %6
}

declare {}*** @julia.get_pgcstack()

declare nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext)

; Function Attrs: inaccessiblemem_or_argmemonly
declare void @jl_gc_queue_root({} addrspace(10)*) #2

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_pool_alloc(i8*, i32, i32) #3

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_big_alloc(i8*, i32) #3

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.fshl.i64(i64, i64, i64) #4

attributes #0 = { sspstrong "probe-stack"="inline-asm" }
attributes #1 = { "probe-stack"="inline-asm" "thunk" }
attributes #2 = { inaccessiblemem_or_argmemonly }
attributes #3 = { allocsize(1) }
attributes #4 = { nofree nosync nounwind readnone speculatable willreturn }
attributes #5 = { "probe-stack"="inline-asm" }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{!3, !3, i64 0}
!3 = !{!"jtbaa_value", !4, i64 0}
!4 = !{!"jtbaa_data", !5, i64 0}
!5 = !{!"jtbaa", !6, i64 0}
!6 = !{!"jtbaa"}
!7 = !{!8, !8, i64 0}
!8 = !{!"jtbaa_stack", !5, i64 0}

bump

Hi, just returned back from vacation, will investigate it ASAP.

The full reproducer is in the commit message on phabricator for this review, though I don't know how to link to the text of a commit specifically. Here's the text of that:

With opt -mcpu=haswell -slp-vectorizer, we see that it might currently produce:

%10 = getelementptr {}**, <2 x {}***> %9, <2 x i32> <i32 10, i32 4>
%11 = bitcast <2 x {}***> %10 to <2 x i64*>
...
%27 = extractelement <2 x i64*> %11, i32 0
%28 = bitcast i64* %27 to <2 x i64>*
store <2 x i64> %22, <2 x i64>* %28, align 4, !tbaa !2

Which is an out-of-bounds store (the extractelement got offset 10
instead of offset 4 as intended). With the fix, we correctly generate
extractelement for i32 1 and generate correct code.

; ModuleID = 'rand3.ll'
source_filename = "rand"
target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128-ni:10:11:12:13"
target triple = "i686-unknown-linux-gnu"

@llvm.compiler.used = appending global [3 x i8*] [i8* bitcast (void ({} addrspace(10)*)* @jl_gc_queue_root to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32, i32)* @jl_gc_pool_alloc to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32)* @jl_gc_big_alloc to i8*)], section "llvm.metadata"

; Function Attrs: sspstrong
define void @julia_rand_5(i64* noalias nocapture sret(i64) %0) #0 {
top:
%1 = call {}*** @julia.get_pgcstack()
%2 = getelementptr {}**, {}*** %1, i32 4
%3 = bitcast {}*** %2 to i64*
%4 = load i64, i64* %3, align 4, !tbaa !2
%5 = getelementptr {}**, {}*** %1, i32 6
%6 = bitcast {}*** %5 to i64*
%7 = load i64, i64* %6, align 4, !tbaa !2
%8 = getelementptr {}**, {}*** %1, i32 8
%9 = bitcast {}*** %8 to i64*
%10 = load i64, i64* %9, align 4, !tbaa !2
%11 = getelementptr {}**, {}*** %1, i32 10
%12 = bitcast {}*** %11 to i64*
%13 = load i64, i64* %12, align 4, !tbaa !2
%14 = add i64 %13, %4
%15 = call i64 @llvm.fshl.i64(i64 %14, i64 %14, i64 23)
%16 = shl i64 %7, 17
%17 = xor i64 %10, %4
%18 = xor i64 %13, %7
%19 = xor i64 %17, %7
%20 = xor i64 %18, %4
%21 = xor i64 %17, %16
%22 = call i64 @llvm.fshl.i64(i64 %18, i64 %18, i64 45)
store i64 %20, i64* %3, align 4, !tbaa !2
store i64 %19, i64* %6, align 4, !tbaa !2
store i64 %21, i64* %9, align 4, !tbaa !2
store i64 %22, i64* %12, align 4, !tbaa !2
store i64 %15, i64* %0, align 4
ret void
}

define nonnull {} addrspace(10)* @jfptr_rand_6({} addrspace(10)* %0, {} addrspace(10)** %1, i32 %2) #1 {
top:
%3 = call {}*** @julia.get_pgcstack()
%4 = alloca i64, align 8
call void @julia_rand_5(i64* noalias nocapture nonnull sret(i64) %4) #5
%5 = load i64, i64* %4, align 8, !tbaa !7
%6 = call nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext %5)
ret {} addrspace(10)* %6
}

declare {}*** @julia.get_pgcstack()

declare nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext)

; Function Attrs: inaccessiblemem_or_argmemonly
declare void @jl_gc_queue_root({} addrspace(10)*) #2

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_pool_alloc(i8*, i32, i32) #3

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_big_alloc(i8*, i32) #3

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.fshl.i64(i64, i64, i64) #4

attributes #0 = { sspstrong "probe-stack"="inline-asm" }
attributes #1 = { "probe-stack"="inline-asm" "thunk" }
attributes #2 = { inaccessiblemem_or_argmemonly }
attributes #3 = { allocsize(1) }
attributes #4 = { nofree nosync nounwind readnone speculatable willreturn }
attributes #5 = { "probe-stack"="inline-asm" }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{!3, !3, i64 0}
!3 = !{!"jtbaa_value", !4, i64 0}
!4 = !{!"jtbaa_data", !5, i64 0}
!5 = !{!"jtbaa", !6, i64 0}
!6 = !{!"jtbaa"}
!7 = !{!8, !8, i64 0}
!8 = !{!"jtbaa_stack", !5, i64 0}

Unable to reproduce. This is what I get with trunk currently:

opt -S -mcpu=haswell -slp-vectorizer repro.ll -o -
; ModuleID = 'repro.ll'
source_filename = "rand"
target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128-ni:10:11:12:13"
target triple = "i686-unknown-linux-gnu"

@llvm.compiler.used = appending global [3 x i8*] [i8* bitcast (void ({} addrspace(10)*)* @jl_gc_queue_root to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32, i32)* @jl_gc_pool_alloc to i8*), i8* bitcast ({} addrspace(10)* (i8*, i32)* @jl
_gc_big_alloc to i8*)], section "llvm.metadata"

; Function Attrs: sspstrong
define void @julia_rand_5(i64* noalias nocapture sret(i64) %0) #0 {
top:
  %1 = call {}*** @julia.get_pgcstack()
  %2 = getelementptr {}**, {}*** %1, i32 4
  %3 = bitcast {}*** %2 to i64*
  %4 = load i64, i64* %3, align 4, !tbaa !2
  %5 = getelementptr {}**, {}*** %1, i32 6
  %6 = bitcast {}*** %5 to i64*
  %7 = getelementptr {}**, {}*** %1, i32 8
  %8 = bitcast {}*** %7 to i64*
  %9 = bitcast i64* %6 to <2 x i64>*
  %10 = load <2 x i64>, <2 x i64>* %9, align 4, !tbaa !2
  %11 = getelementptr {}**, {}*** %1, i32 10
  %12 = bitcast {}*** %11 to i64*
  %13 = load i64, i64* %12, align 4, !tbaa !2
  %14 = add i64 %13, %4
  %15 = call i64 @llvm.fshl.i64(i64 %14, i64 %14, i64 23)
  %16 = extractelement <2 x i64> %10, i32 0
  %17 = shl i64 %16, 17
  %18 = insertelement <2 x i64> poison, i64 %13, i32 0
  %19 = insertelement <2 x i64> %18, i64 %4, i32 1
  %20 = xor <2 x i64> %19, %10
  %21 = insertelement <2 x i64> poison, i64 %4, i32 0
  %22 = insertelement <2 x i64> %21, i64 %16, i32 1
  %23 = xor <2 x i64> %20, %22
  %24 = extractelement <2 x i64> %20, i32 1
  %25 = xor i64 %24, %17
  %26 = extractelement <2 x i64> %20, i32 0
  %27 = call i64 @llvm.fshl.i64(i64 %26, i64 %26, i64 45)
  %28 = bitcast i64* %3 to <2 x i64>*
  store <2 x i64> %23, <2 x i64>* %28, align 4, !tbaa !2
  store i64 %25, i64* %8, align 4, !tbaa !2
  store i64 %27, i64* %12, align 4, !tbaa !2
  store i64 %15, i64* %0, align 4
  ret void
}

define nonnull {} addrspace(10)* @jfptr_rand_6({} addrspace(10)* %0, {} addrspace(10)** %1, i32 %2) #1 {
top:
  %3 = call {}*** @julia.get_pgcstack()
  %4 = alloca i64, align 8
  call void @julia_rand_5(i64* noalias nocapture nonnull sret(i64) %4) #6
  %5 = load i64, i64* %4, align 8, !tbaa !7
  %6 = call nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext %5)
  ret {} addrspace(10)* %6
}

declare {}*** @julia.get_pgcstack() #2

declare nonnull {} addrspace(10)* @jl_box_uint64(i64 zeroext) #2

; Function Attrs: inaccessiblemem_or_argmemonly
declare void @jl_gc_queue_root({} addrspace(10)*) #3

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_pool_alloc(i8*, i32, i32) #4

; Function Attrs: allocsize(1)
declare noalias nonnull {} addrspace(10)* @jl_gc_big_alloc(i8*, i32) #4

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.fshl.i64(i64, i64, i64) #5

attributes #0 = { sspstrong "probe-stack"="inline-asm" "target-cpu"="haswell" }
attributes #1 = { "probe-stack"="inline-asm" "target-cpu"="haswell" "thunk" }
attributes #2 = { "target-cpu"="haswell" }
attributes #3 = { inaccessiblemem_or_argmemonly "target-cpu"="haswell" }
attributes #4 = { allocsize(1) "target-cpu"="haswell" }
attributes #5 = { nofree nosync nounwind readnone speculatable willreturn "target-cpu"="haswell" }
attributes #6 = { "probe-stack"="inline-asm" }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{!3, !3, i64 0}
!3 = !{!"jtbaa_value", !4, i64 0}
!4 = !{!"jtbaa_data", !5, i64 0}
!5 = !{!"jtbaa", !6, i64 0}
!6 = !{!"jtbaa"}
!7 = !{!8, !8, i64 0}
!8 = !{!"jtbaa_stack", !5, i64 0}