$ cat t.ll
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Reproducer kernel. Pointers are fetched from %buffer_table: slots 1, 2 and 3
; are read, slot 0 is the 96-byte destination, and the destination pointer is
; finally stored through slot 4. The destination is written as three
; <8 x i32> stores (byte offsets 0, 32, 64). In each store a lane takes the
; slot-1 value when the matching lane of the slot-3 vector compares equal to
; zero, and the slot-2 value otherwise.
;
; NOTE(review): the mask operands of the five shufflevector instructions
; (%shuffle, %shuffle7, %shuffle8, %29, %30) were missing from the pasted
; listing, which left the module unparseable. The masks below are
; reconstructed from the lane order used by the insertelement chain
; (%45..%53) and from the `k0 | (k0 << 4)` / vinserti128 sequences in the two
; assembly listings -- confirm against the original t.ll.
; Function Attrs: nofree norecurse nounwind uwtable
define void @jaxpr_computation.15(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  ; Fetch the buffer pointers: slot 2 -> %2, slot 3 -> %5, slot 1 -> %8, slot 0 -> %9.
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %1 = bitcast i8** %0 to [2 x [4 x i32]]**
  %2 = load [2 x [4 x i32]]*, [2 x [4 x i32]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %4 = bitcast i8** %3 to [1 x [4 x i32]]**
  %5 = load [1 x [4 x i32]]*, [1 x [4 x i32]]** %4, align 8, !invariant.load !0, !dereferenceable !3, !align !2
  %6 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %7 = bitcast i8** %6 to [2 x [3 x [4 x i32]]]**
  %8 = load [2 x [3 x [4 x i32]]]*, [2 x [3 x [4 x i32]]]** %7, align 8, !invariant.load !0, !dereferenceable !4, !align !2
  %9 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !4, !align !2
  %10 = getelementptr inbounds [1 x [4 x i32]], [1 x [4 x i32]]* %5, i64 0, i64 0, i64 2
  %11 = getelementptr inbounds [1 x [4 x i32]], [1 x [4 x i32]]* %5, i64 0, i64 0, i64 3

  ; First store (destination bytes 0..31): 4-lane predicate repeated twice.
  %12 = bitcast [1 x [4 x i32]]* %5 to <4 x i32>*
  %13 = load <4 x i32>, <4 x i32>* %12, align 8, !invariant.load !0, !noalias !5
  %14 = icmp eq <4 x i32> %13, zeroinitializer
  %shuffle = shufflevector <4 x i1> %14, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %15 = bitcast [2 x [4 x i32]]* %2 to <4 x i32>*
  %16 = load <4 x i32>, <4 x i32>* %15, align 8, !alias.scope !9, !noalias !10
  %shuffle7 = shufflevector <4 x i32> %16, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %17 = bitcast [2 x [3 x [4 x i32]]]* %8 to <8 x i32>*
  %18 = load <8 x i32>, <8 x i32>* %17, align 8, !invariant.load !0, !noalias !5
  %19 = select <8 x i1> %shuffle, <8 x i32> %18, <8 x i32> %shuffle7
  %20 = bitcast i8* %9 to <8 x i32>*
  store <8 x i32> %19, <8 x i32>* %20, align 8, !alias.scope !10, !noalias !11

  ; Second store (destination bytes 32..63): same predicate, fallback is the
  ; concatenation of the two slot-2 rows (%16 then %26).
  %21 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* %8, i64 0, i64 0, i64 2, i64 0
  %22 = getelementptr inbounds i8, i8* %9, i64 32
  %23 = extractelement <4 x i1> %14, i32 3
  %24 = getelementptr inbounds [2 x [4 x i32]], [2 x [4 x i32]]* %2, i64 0, i64 1, i64 0
  %25 = bitcast i32* %24 to <4 x i32>*
  %26 = load <4 x i32>, <4 x i32>* %25, align 8, !alias.scope !9, !noalias !10
  %shuffle8 = shufflevector <4 x i32> %26, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %27 = bitcast i32* %21 to <8 x i32>*
  %28 = load <8 x i32>, <8 x i32>* %27, align 8, !invariant.load !0, !noalias !5
  %29 = shufflevector <4 x i1> %14, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %30 = shufflevector <4 x i32> %16, <4 x i32> %26, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30
  %32 = bitcast i8* %22 to <8 x i32>*
  store <8 x i32> %31, <8 x i32>* %32, align 8, !alias.scope !10, !noalias !11

  ; Third store (destination bytes 64..95): the <8 x i1> predicate is built
  ; lane by lane with insertelement from a <2 x i1> compare (%37), two scalar
  ; compares (%39, %41) and lane 3 of %14 (%23).
  %33 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* %8, i64 0, i64 1, i64 1, i64 0
  %34 = getelementptr inbounds i8, i8* %9, i64 64
  %35 = bitcast [1 x [4 x i32]]* %5 to <2 x i32>*
  %36 = load <2 x i32>, <2 x i32>* %35, align 8, !invariant.load !0, !noalias !5
  %37 = icmp eq <2 x i32> %36, zeroinitializer
  %38 = load i32, i32* %10, align 8, !invariant.load !0, !noalias !5
  %39 = icmp eq i32 %38, 0
  %40 = load i32, i32* %11, align 4, !invariant.load !0, !noalias !5
  %41 = icmp eq i32 %40, 0
  %42 = bitcast i32* %33 to <8 x i32>*
  %43 = load <8 x i32>, <8 x i32>* %42, align 8, !invariant.load !0, !noalias !5
  %44 = extractelement <2 x i1> %37, i32 0
  %45 = insertelement <8 x i1> undef, i1 %44, i32 0
  %46 = extractelement <2 x i1> %37, i32 1
  %47 = insertelement <8 x i1> %45, i1 %46, i32 1
  %48 = insertelement <8 x i1> %47, i1 %39, i32 2
  %49 = insertelement <8 x i1> %48, i1 %23, i32 3
  %50 = insertelement <8 x i1> %49, i1 %44, i32 4
  %51 = insertelement <8 x i1> %50, i1 %46, i32 5
  %52 = insertelement <8 x i1> %51, i1 %39, i32 6
  %53 = insertelement <8 x i1> %52, i1 %41, i32 7
  %54 = select <8 x i1> %53, <8 x i32> %43, <8 x i32> %shuffle8
  %55 = bitcast i8* %34 to <8 x i32>*
  store <8 x i32> %54, <8 x i32>* %55, align 8, !alias.scope !10, !noalias !11

  ; Publish the destination pointer through slot 4.
  %56 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %57 = bitcast i8** %56 to [1 x i8*]**
  %58 = load [1 x i8*]*, [1 x i8*]** %57, align 8, !invariant.load !0, !dereferenceable !2, !align !2
  %59 = getelementptr inbounds [1 x i8*], [1 x i8*]* %58, i64 0, i64 0
  store i8* %9, i8** %59, align 8, !alias.scope !13, !noalias !10
  ret void
}

attributes #0 = { nofree norecurse nounwind uwtable "no-frame-pointer-elim"="false" "no-signed-zeros-fp-math"="true" "reciprocal-estimates"="none" "unsafe-fp-math"="true" }

!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{i64 16}
!4 = !{i64 96}
!5 = !{!6, !8}
!6 = !{!"buffer: {index:0, offset:0, size:96}", !7}
!7 = !{!"XLA global AA domain"}
!8 = !{!"buffer: {index:2, offset:0, size:32}", !7}
!9 = !{!8}
!10 = !{!6}
!11 = !{!8, !12}
!12 = !{!"buffer: {index:4, offset:0, size:8}", !7}
!13 = !{!12}
$ # This yields wrong results
$ # NOTE(review): presumably the index-1 mask insert is the kshiftlb/kshiftrb/kxorb
$ # triple at 0x7e-0x8a below; works.s has vpmovm2w/vpinsrw/vpsllw/vpmovw2m at
$ # 0x81-0x91 in the corresponding place.
$ cat broken.s
jaxpr_computation.15:
0x00000000 mov rsi, qword ptr [rcx + 16]
0x00000004 mov rdi, qword ptr [rcx + 24]
0x00000008 mov rax, qword ptr [rcx]
0x0000000b mov rdx, qword ptr [rcx + 8]
0x0000000f vmovdqu xmm0, xmmword ptr [rdi]
0x00000013 vptestnmd k0, xmm0, xmm0
0x00000019 kshiftlb k1, k0, 4
0x0000001f korb k1, k0, k1
0x00000023 vmovdqu xmm0, xmmword ptr [rsi]
0x00000027 vinserti128 ymm1, ymm0, xmm0, 1
0x0000002d vmovdqu32 ymm1 {k1}, ymmword ptr [rdx]
0x00000033 vmovdqu ymmword ptr [rax], ymm1
0x00000037 vmovdqu xmm1, xmmword ptr [rsi + 16]
0x0000003c kshiftrb k0, k0, 3
0x00000042 vinserti128 ymm0, ymm0, xmm1, 1
0x00000048 vmovdqu32 ymm0 {k1}, ymmword ptr [rdx + 32]
0x0000004f vmovdqu ymmword ptr [rax + 32], ymm0
0x00000054 vinserti128 ymm0, ymm1, xmm1, 1
0x0000005a vmovq xmm1, qword ptr [rdi]
0x0000005e vptestnmd k3, xmm1, xmm1
0x00000064 cmp dword ptr [rdi + 8], 0
0x00000068 kshiftrb k2, k3, 1
0x0000006e sete sil
0x00000072 cmp dword ptr [rdi + 12], 0
0x00000076 sete dil
0x0000007a kmovd k1, esi
0x0000007e kshiftlb k4, k0, 7
0x00000084 kshiftrb k4, k4, 6
0x0000008a kxorb k4, k3, k4
0x0000008e kshiftrb k5, k4, 2
0x00000094 kxorb k5, k5, k1
0x00000098 kshiftlb k5, k5, 7
0x0000009e kshiftrb k5, k5, 5
0x000000a4 kxorb k4, k4, k5
0x000000a8 kshiftrb k5, k4, 3
0x000000ae kxorb k0, k5, k0
0x000000b2 kshiftlb k0, k0, 7
0x000000b8 kshiftrb k0, k0, 4
0x000000be kxorb k0, k4, k0
0x000000c2 kshiftrb k4, k0, 4
0x000000c8 kxorb k3, k4, k3
0x000000cc kshiftlb k3, k3, 7
0x000000d2 kshiftrb k3, k3, 3
0x000000d8 kxorb k0, k0, k3
0x000000dc kshiftrb k3, k0, 5
0x000000e2 kxorb k2, k3, k2
0x000000e6 kshiftlb k2, k2, 7
0x000000ec kshiftrb k2, k2, 2
0x000000f2 kxorb k0, k0, k2
0x000000f6 kshiftrb k2, k0, 6
0x000000fc kxorb k1, k2, k1
0x00000100 kshiftlb k1, k1, 6
0x00000106 kxorb k0, k0, k1
0x0000010a kshiftlb k0, k0, 1
0x00000110 kshiftrb k0, k0, 1
0x00000116 kmovd k1, edi
0x0000011a kshiftlb k1, k1, 7
0x00000120 korb k1, k0, k1
0x00000124 vmovdqu32 ymm0 {k1}, ymmword ptr [rdx + 64]
0x0000012b vmovdqu ymmword ptr [rax + 64], ymm0
0x00000130 mov rcx, qword ptr [rcx + 32]
0x00000134 mov qword ptr [rcx], rax
0x00000137 vzeroupper
0x0000013a ret
$ # Now turned off insert1BitVector for the insert at index = 1
$ # This hides the issue
$ cat works.s
jaxpr_computation.15:
0x00000000 mov rsi, qword ptr [rcx + 16]
0x00000004 mov rdi, qword ptr [rcx + 24]
0x00000008 mov rax, qword ptr [rcx]
0x0000000b mov r8, qword ptr [rcx + 8]
0x0000000f vmovdqu xmm0, xmmword ptr [rdi]
0x00000013 vptestnmd k0, xmm0, xmm0
0x00000019 kshiftlb k1, k0, 4
0x0000001f korb k1, k0, k1
0x00000023 vmovdqu xmm0, xmmword ptr [rsi]
0x00000027 vinserti128 ymm1, ymm0, xmm0, 1
0x0000002d vmovdqu32 ymm1 {k1}, ymmword ptr [r8]
0x00000033 kshiftrb k0, k0, 3
0x00000039 vmovdqu ymmword ptr [rax], ymm1
0x0000003d vmovdqu xmm1, xmmword ptr [rsi + 16]
0x00000042 vinserti128 ymm0, ymm0, xmm1, 1
0x00000048 vmovdqu32 ymm0 {k1}, ymmword ptr [r8 + 32]
0x0000004f vmovdqu ymmword ptr [rax + 32], ymm0
0x00000054 vmovq xmm0, qword ptr [rdi]
0x00000058 vinserti128 ymm1, ymm1, xmm1, 1
0x0000005e vptestnmd k2, xmm0, xmm0
0x00000064 kshiftrb k1, k2, 1
0x0000006a kmovd esi, k1
0x0000006e cmp dword ptr [rdi + 8], 0
0x00000072 sete dl
0x00000075 cmp dword ptr [rdi + 12], 0
0x00000079 sete dil
0x0000007d kmovd k1, edx
0x00000081 vpmovm2w xmm0, k2
0x00000087 vpinsrw xmm0, xmm0, esi, 1
0x0000008c vpsllw xmm0, xmm0, 15
0x00000091 vpmovw2m k3, xmm0
0x00000097 kshiftrb k4, k3, 2
0x0000009d kxorb k4, k4, k1
0x000000a1 kshiftlb k4, k4, 7
0x000000a7 kshiftrb k4, k4, 5
0x000000ad kxorb k3, k3, k4
0x000000b1 kshiftrb k4, k3, 3
0x000000b7 kxorb k0, k4, k0
0x000000bb kshiftlb k0, k0, 7
0x000000c1 kshiftrb k0, k0, 4
0x000000c7 kxorb k0, k3, k0
0x000000cb kshiftrb k3, k0, 4
0x000000d1 kxorb k2, k3, k2
0x000000d5 kshiftlb k2, k2, 7
0x000000db kshiftrb k2, k2, 3
0x000000e1 kxorb k0, k0, k2
0x000000e5 kshiftrb k2, k0, 5
0x000000eb kmovd k3, esi
0x000000ef kxorb k2, k2, k3
0x000000f3 kshiftlb k2, k2, 7
0x000000f9 kshiftrb k2, k2, 2
0x000000ff kxorb k0, k0, k2
0x00000103 kshiftrb k2, k0, 6
0x00000109 kxorb k1, k2, k1
0x0000010d kshiftlb k1, k1, 6
0x00000113 kxorb k0, k0, k1
0x00000117 kshiftlb k0, k0, 1
0x0000011d kshiftrb k0, k0, 1
0x00000123 kmovd k1, edi
0x00000127 kshiftlb k1, k1, 7
0x0000012d korb k1, k0, k1
0x00000131 vmovdqu32 ymm1 {k1}, ymmword ptr [r8 + 64]
0x00000138 vmovdqu ymmword ptr [rax + 64], ymm1
0x0000013d mov rcx, qword ptr [rcx + 32]
0x00000141 mov qword ptr [rcx], rax
0x00000144 vzeroupper
0x00000147 ret