Page MenuHomePhabricator
Paste P8166

AVX512 bug
ActivePublic

Authored by bkramer on Oct 1 2019, 12:03 PM.
$ cat t.ll
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Reproducer: three 8 x i32 selects between two input buffers, each driven by
; an "element == 0" mask derived from a third 4 x i32 buffer. The results are
; written to buffer_table[0] at byte offsets 0, 32 and 64, and that buffer's
; address is finally stored through buffer_table[4].
; Only %buffer_table is used; the other parameters are marked readnone.
; Function Attrs: nofree norecurse nounwind uwtable
define void @jaxpr_computation.15(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
; %2 = buffer_table[2], viewed as [2 x [4 x i32]] (32 dereferenceable bytes): the "false" operand of the selects.
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%1 = bitcast i8** %0 to [2 x [4 x i32]]**
%2 = load [2 x [4 x i32]]*, [2 x [4 x i32]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
; %5 = buffer_table[3], viewed as [1 x [4 x i32]] (16 dereferenceable bytes): the values compared against zero.
%3 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
%4 = bitcast i8** %3 to [1 x [4 x i32]]**
%5 = load [1 x [4 x i32]]*, [1 x [4 x i32]]** %4, align 8, !invariant.load !0, !dereferenceable !3, !align !2
; %8 = buffer_table[1], viewed as [2 x [3 x [4 x i32]]] (96 dereferenceable bytes): the "true" operand of the selects.
%6 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%7 = bitcast i8** %6 to [2 x [3 x [4 x i32]]]**
%8 = load [2 x [3 x [4 x i32]]]*, [2 x [3 x [4 x i32]]]** %7, align 8, !invariant.load !0, !dereferenceable !4, !align !2
; %9 = buffer_table[0] (96 dereferenceable bytes): the output buffer.
%9 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !4, !align !2
; %10 / %11 point at elements 2 and 3 of the %5 buffer; they feed the scalar compares further down.
%10 = getelementptr inbounds [1 x [4 x i32]], [1 x [4 x i32]]* %5, i64 0, i64 0, i64 2
%11 = getelementptr inbounds [1 x [4 x i32]], [1 x [4 x i32]]* %5, i64 0, i64 0, i64 3
; %14 = 4-lane mask, lane i is true when element i of the %5 buffer is zero.
%12 = bitcast [1 x [4 x i32]]* %5 to <4 x i32>*
%13 = load <4 x i32>, <4 x i32>* %12, align 8, !invariant.load !0, !noalias !5
%14 = icmp eq <4 x i32> %13, zeroinitializer
; %shuffle = the 4-lane mask repeated twice to form an 8-lane mask.
%shuffle = shufflevector <4 x i1> %14, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; First store (output bytes 0..31): select between the first 8 i32 of %8 and row 0 of %2 repeated twice.
%15 = bitcast [2 x [4 x i32]]* %2 to <4 x i32>*
%16 = load <4 x i32>, <4 x i32>* %15, align 8, !alias.scope !9, !noalias !10
%shuffle7 = shufflevector <4 x i32> %16, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%17 = bitcast [2 x [3 x [4 x i32]]]* %8 to <8 x i32>*
%18 = load <8 x i32>, <8 x i32>* %17, align 8, !invariant.load !0, !noalias !5
%19 = select <8 x i1> %shuffle, <8 x i32> %18, <8 x i32> %shuffle7
%20 = bitcast i8* %9 to <8 x i32>*
store <8 x i32> %19, <8 x i32>* %20, align 8, !alias.scope !10, !noalias !11
; Second store (output bytes 32..63): %21 is i32 index 8 (byte offset 32) into the %8 buffer.
%21 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* %8, i64 0, i64 0, i64 2, i64 0
%22 = getelementptr inbounds i8, i8* %9, i64 32
; %23 = lane 3 of the vector compare; reused below as lane 3 of the hand-built mask %53.
%23 = extractelement <4 x i1> %14, i32 3
%24 = getelementptr inbounds [2 x [4 x i32]], [2 x [4 x i32]]* %2, i64 0, i64 1, i64 0
%25 = bitcast i32* %24 to <4 x i32>*
%26 = load <4 x i32>, <4 x i32>* %25, align 8, !alias.scope !9, !noalias !10
; %shuffle8 = row 1 of %2 repeated twice; it is the "false" operand of the third select.
%shuffle8 = shufflevector <4 x i32> %26, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%27 = bitcast i32* %21 to <8 x i32>*
%28 = load <8 x i32>, <8 x i32>* %27, align 8, !invariant.load !0, !noalias !5
; %29 repeats the same 8-lane mask as %shuffle; %30 concatenates row 0 and row 1 of %2.
%29 = shufflevector <4 x i1> %14, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%30 = shufflevector <4 x i32> %16, <4 x i32> %26, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%31 = select <8 x i1> %29, <8 x i32> %28, <8 x i32> %30
%32 = bitcast i8* %22 to <8 x i32>*
store <8 x i32> %31, <8 x i32>* %32, align 8, !alias.scope !10, !noalias !11
; Third store (output bytes 64..95): %33 is i32 index 16 (byte offset 64) into the %8 buffer.
%33 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* %8, i64 0, i64 1, i64 1, i64 0
%34 = getelementptr inbounds i8, i8* %9, i64 64
; The compare of the %5 buffer is redone piecewise here: elements 0..1 as a <2 x i32>,
; elements 2 and 3 as scalars.
%35 = bitcast [1 x [4 x i32]]* %5 to <2 x i32>*
%36 = load <2 x i32>, <2 x i32>* %35, align 8, !invariant.load !0, !noalias !5
%37 = icmp eq <2 x i32> %36, zeroinitializer
%38 = load i32, i32* %10, align 8, !invariant.load !0, !noalias !5
%39 = icmp eq i32 %38, 0
%40 = load i32, i32* %11, align 4, !invariant.load !0, !noalias !5
%41 = icmp eq i32 %40, 0
%42 = bitcast i32* %33 to <8 x i32>*
%43 = load <8 x i32>, <8 x i32>* %42, align 8, !invariant.load !0, !noalias !5
; Build the 8-lane mask %53 one bit at a time:
;   lane 0 = %44, lane 1 = %46, lane 2 = %39, lane 3 = %23,
;   lane 4 = %44, lane 5 = %46, lane 6 = %39, lane 7 = %41.
; Lanes 3 and 7 both test element 3 of the %5 buffer, but lane 3 takes the bit
; from the 4-wide vector compare (%23) while lane 7 takes the scalar compare (%41).
; This insertelement chain is what the backend lowers to the k-register sequences.
%44 = extractelement <2 x i1> %37, i32 0
%45 = insertelement <8 x i1> undef, i1 %44, i32 0
%46 = extractelement <2 x i1> %37, i32 1
%47 = insertelement <8 x i1> %45, i1 %46, i32 1
%48 = insertelement <8 x i1> %47, i1 %39, i32 2
%49 = insertelement <8 x i1> %48, i1 %23, i32 3
%50 = insertelement <8 x i1> %49, i1 %44, i32 4
%51 = insertelement <8 x i1> %50, i1 %46, i32 5
%52 = insertelement <8 x i1> %51, i1 %39, i32 6
%53 = insertelement <8 x i1> %52, i1 %41, i32 7
%54 = select <8 x i1> %53, <8 x i32> %43, <8 x i32> %shuffle8
%55 = bitcast i8* %34 to <8 x i32>*
store <8 x i32> %54, <8 x i32>* %55, align 8, !alias.scope !10, !noalias !11
; Publish the output buffer address: store %9 into element 0 of the [1 x i8*] at buffer_table[4].
%56 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
%57 = bitcast i8** %56 to [1 x i8*]**
%58 = load [1 x i8*]*, [1 x i8*]** %57, align 8, !invariant.load !0, !dereferenceable !2, !align !2
%59 = getelementptr inbounds [1 x i8*], [1 x i8*]* %58, i64 0, i64 0
store i8* %9, i8** %59, align 8, !alias.scope !13, !noalias !10
ret void
}
attributes #0 = { nofree norecurse nounwind uwtable "no-frame-pointer-elim"="false" "no-signed-zeros-fp-math"="true" "reciprocal-estimates"="none" "unsafe-fp-math"="true" }
; !0 is the empty node used by !invariant.load; !1..!4 are the byte counts used
; by the !dereferenceable / !align annotations above.
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{i64 16}
!4 = !{i64 96}
; !5..!13 are the alias scopes and scope lists referenced by !alias.scope / !noalias;
; every scope belongs to the "XLA global AA domain" (!7).
!5 = !{!6, !8}
!6 = !{!"buffer: {index:0, offset:0, size:96}", !7}
!7 = !{!"XLA global AA domain"}
!8 = !{!"buffer: {index:2, offset:0, size:32}", !7}
!9 = !{!8}
!10 = !{!6}
!11 = !{!8, !12}
!12 = !{!"buffer: {index:4, offset:0, size:8}", !7}
!13 = !{!12}
$ # This yields wrong results
$ cat broken.s
; Disassembly (Intel syntax) of the miscompiled build of jaxpr_computation.15.
; rcx is the 4th integer argument under SysV AMD64, i.e. %buffer_table in t.ll.
; Register roles: rax = buffer_table[0] (output), rdx = buffer_table[1],
; rsi = buffer_table[2], rdi = buffer_table[3] (values compared against zero).
jaxpr_computation.15:
0x00000000 mov rsi, qword ptr [rcx + 16]
0x00000004 mov rdi, qword ptr [rcx + 24]
0x00000008 mov rax, qword ptr [rcx]
0x0000000b mov rdx, qword ptr [rcx + 8]
; k0 = 4-bit mask, bit i set when dword i of [rdi] is zero.
0x0000000f vmovdqu xmm0, xmmword ptr [rdi]
0x00000013 vptestnmd k0, xmm0, xmm0
; k1 = k0 duplicated into the low and high nibble (8-lane mask for the first two selects).
0x00000019 kshiftlb k1, k0, 4
0x0000001f korb k1, k0, k1
; First select: ymm1 = [rsi] in both halves, lanes with k1 set are overwritten from [rdx]; stored at [rax].
0x00000023 vmovdqu xmm0, xmmword ptr [rsi]
0x00000027 vinserti128 ymm1, ymm0, xmm0, 1
0x0000002d vmovdqu32 ymm1 {k1}, ymmword ptr [rdx]
0x00000033 vmovdqu ymmword ptr [rax], ymm1
0x00000037 vmovdqu xmm1, xmmword ptr [rsi + 16]
; k0 >>= 3: bit 0 of k0 now holds the lane-3 compare result.
0x0000003c kshiftrb k0, k0, 3
; Second select: ymm0 = {[rsi], [rsi + 16]}, masked by k1 from [rdx + 32]; stored at [rax + 32].
0x00000042 vinserti128 ymm0, ymm0, xmm1, 1
0x00000048 vmovdqu32 ymm0 {k1}, ymmword ptr [rdx + 32]
0x0000004f vmovdqu ymmword ptr [rax + 32], ymm0
; ymm0 = [rsi + 16] in both halves: the pass-through value of the third select.
0x00000054 vinserti128 ymm0, ymm1, xmm1, 1
; k3 = compare of the low two dwords of [rdi]; vmovq zero-fills the upper 64 bits,
; so bits 2..3 of k3 come out set and are replaced by the inserts below.
0x0000005a vmovq xmm1, qword ptr [rdi]
0x0000005e vptestnmd k3, xmm1, xmm1
; sil = ([rdi + 8] == 0), dil = ([rdi + 12] == 0); k2 bit 0 = lane-1 compare (k3 >> 1).
0x00000064 cmp dword ptr [rdi + 8], 0
0x00000068 kshiftrb k2, k3, 1
0x0000006e sete sil
0x00000072 cmp dword ptr [rdi + 12], 0
0x00000076 sete dil
0x0000007a kmovd k1, esi
; Insert at index 1. Every later insert first XORs the new bit with the vector
; shifted right by the index; this one does not. It moves bit 0 of k0 (the
; lane-3 compare) to bit 1 and XORs that straight into k3.
; NOTE(review): bit 1 of k3 already holds the lane-1 compare, so bit 1 ends up
; as lane1 XOR lane3 rather than lane1. This looks like the miscompile; confirm.
0x0000007e kshiftlb k4, k0, 7
0x00000084 kshiftrb k4, k4, 6
0x0000008a kxorb k4, k3, k4
; Insert at index 2: new bit = k1 bit 0 (the [rdi + 8] == 0 result).
; Pattern: t = (vec >> idx) ^ new; t <<= 7; t >>= (7 - idx); vec ^= t.
0x0000008e kshiftrb k5, k4, 2
0x00000094 kxorb k5, k5, k1
0x00000098 kshiftlb k5, k5, 7
0x0000009e kshiftrb k5, k5, 5
0x000000a4 kxorb k4, k4, k5
; Insert at index 3: new bit = k0 bit 0 (lane-3 result of the 4-wide compare).
0x000000a8 kshiftrb k5, k4, 3
0x000000ae kxorb k0, k5, k0
0x000000b2 kshiftlb k0, k0, 7
0x000000b8 kshiftrb k0, k0, 4
0x000000be kxorb k0, k4, k0
; Insert at index 4: new bit = k3 bit 0 (lane-0 compare).
0x000000c2 kshiftrb k4, k0, 4
0x000000c8 kxorb k3, k4, k3
0x000000cc kshiftlb k3, k3, 7
0x000000d2 kshiftrb k3, k3, 3
0x000000d8 kxorb k0, k0, k3
; Insert at index 5: new bit = k2 bit 0 (lane-1 compare).
0x000000dc kshiftrb k3, k0, 5
0x000000e2 kxorb k2, k3, k2
0x000000e6 kshiftlb k2, k2, 7
0x000000ec kshiftrb k2, k2, 2
0x000000f2 kxorb k0, k0, k2
; Insert at index 6: new bit = k1 bit 0; a single left shift by 6 is used here,
; bit 7 is cleaned up by the next step.
0x000000f6 kshiftrb k2, k0, 6
0x000000fc kxorb k1, k2, k1
0x00000100 kshiftlb k1, k1, 6
0x00000106 kxorb k0, k0, k1
; Insert at index 7: clear bit 7 (shift left then right by 1), then OR in dil << 7.
0x0000010a kshiftlb k0, k0, 1
0x00000110 kshiftrb k0, k0, 1
0x00000116 kmovd k1, edi
0x0000011a kshiftlb k1, k1, 7
0x00000120 korb k1, k0, k1
; Third select: lanes with k1 set are loaded from [rdx + 64] over ymm0; stored at [rax + 64].
0x00000124 vmovdqu32 ymm0 {k1}, ymmword ptr [rdx + 64]
0x0000012b vmovdqu ymmword ptr [rax + 64], ymm0
; *buffer_table[4] = rax (output buffer address).
0x00000130 mov rcx, qword ptr [rcx + 32]
0x00000134 mov qword ptr [rcx], rax
0x00000137 vzeroupper
0x0000013a ret
$ # Now with insert1BitVector turned off for the insert at index = 1.
$ # This hides the issue.
$ cat works.s
; Disassembly (Intel syntax) of jaxpr_computation.15 from the build where the
; wrong results do not show up.
; rcx is the 4th integer argument under SysV AMD64, i.e. %buffer_table in t.ll.
; Register roles: rax = buffer_table[0] (output), r8 = buffer_table[1],
; rsi = buffer_table[2], rdi = buffer_table[3] (values compared against zero).
jaxpr_computation.15:
0x00000000 mov rsi, qword ptr [rcx + 16]
0x00000004 mov rdi, qword ptr [rcx + 24]
0x00000008 mov rax, qword ptr [rcx]
0x0000000b mov r8, qword ptr [rcx + 8]
; k0 = 4-bit mask, bit i set when dword i of [rdi] is zero.
0x0000000f vmovdqu xmm0, xmmword ptr [rdi]
0x00000013 vptestnmd k0, xmm0, xmm0
; k1 = k0 duplicated into the low and high nibble (8-lane mask for the first two selects).
0x00000019 kshiftlb k1, k0, 4
0x0000001f korb k1, k0, k1
; First select: ymm1 = [rsi] in both halves, lanes with k1 set are overwritten from [r8]; stored at [rax].
0x00000023 vmovdqu xmm0, xmmword ptr [rsi]
0x00000027 vinserti128 ymm1, ymm0, xmm0, 1
0x0000002d vmovdqu32 ymm1 {k1}, ymmword ptr [r8]
; k0 >>= 3: bit 0 of k0 now holds the lane-3 compare result.
0x00000033 kshiftrb k0, k0, 3
0x00000039 vmovdqu ymmword ptr [rax], ymm1
; Second select: ymm0 = {[rsi], [rsi + 16]}, masked by k1 from [r8 + 32]; stored at [rax + 32].
0x0000003d vmovdqu xmm1, xmmword ptr [rsi + 16]
0x00000042 vinserti128 ymm0, ymm0, xmm1, 1
0x00000048 vmovdqu32 ymm0 {k1}, ymmword ptr [r8 + 32]
0x0000004f vmovdqu ymmword ptr [rax + 32], ymm0
; k2 = compare of the low two dwords of [rdi] (vmovq zero-fills the upper 64 bits);
; ymm1 = [rsi + 16] in both halves: the pass-through value of the third select.
0x00000054 vmovq xmm0, qword ptr [rdi]
0x00000058 vinserti128 ymm1, ymm1, xmm1, 1
0x0000005e vptestnmd k2, xmm0, xmm0
; esi bit 0 = lane-1 compare (k2 >> 1), moved to a GPR for the vpinsrw below.
0x00000064 kshiftrb k1, k2, 1
0x0000006a kmovd esi, k1
; dl = ([rdi + 8] == 0), dil = ([rdi + 12] == 0).
0x0000006e cmp dword ptr [rdi + 8], 0
0x00000072 sete dl
0x00000075 cmp dword ptr [rdi + 12], 0
0x00000079 sete dil
0x0000007d kmovd k1, edx
; Insert at index 1, done through a vector register instead of k-register shifts:
; expand k2 to 16-bit lanes, insert esi as word 1, shift bit 0 of each word up
; to the sign bit, and convert the sign bits back into mask k3.
0x00000081 vpmovm2w xmm0, k2
0x00000087 vpinsrw xmm0, xmm0, esi, 1
0x0000008c vpsllw xmm0, xmm0, 15
0x00000091 vpmovw2m k3, xmm0
; Insert at index 2: new bit = k1 bit 0 (the [rdi + 8] == 0 result).
; Pattern: t = (vec >> idx) ^ new; t <<= 7; t >>= (7 - idx); vec ^= t.
0x00000097 kshiftrb k4, k3, 2
0x0000009d kxorb k4, k4, k1
0x000000a1 kshiftlb k4, k4, 7
0x000000a7 kshiftrb k4, k4, 5
0x000000ad kxorb k3, k3, k4
; Insert at index 3: new bit = k0 bit 0 (lane-3 result of the 4-wide compare).
0x000000b1 kshiftrb k4, k3, 3
0x000000b7 kxorb k0, k4, k0
0x000000bb kshiftlb k0, k0, 7
0x000000c1 kshiftrb k0, k0, 4
0x000000c7 kxorb k0, k3, k0
; Insert at index 4: new bit = k2 bit 0 (lane-0 compare).
0x000000cb kshiftrb k3, k0, 4
0x000000d1 kxorb k2, k3, k2
0x000000d5 kshiftlb k2, k2, 7
0x000000db kshiftrb k2, k2, 3
0x000000e1 kxorb k0, k0, k2
; Insert at index 5: new bit = esi bit 0 (lane-1 compare), moved back into k3.
0x000000e5 kshiftrb k2, k0, 5
0x000000eb kmovd k3, esi
0x000000ef kxorb k2, k2, k3
0x000000f3 kshiftlb k2, k2, 7
0x000000f9 kshiftrb k2, k2, 2
0x000000ff kxorb k0, k0, k2
; Insert at index 6: new bit = k1 bit 0; a single left shift by 6 is used here,
; bit 7 is cleaned up by the next step.
0x00000103 kshiftrb k2, k0, 6
0x00000109 kxorb k1, k2, k1
0x0000010d kshiftlb k1, k1, 6
0x00000113 kxorb k0, k0, k1
; Insert at index 7: clear bit 7 (shift left then right by 1), then OR in dil << 7.
0x00000117 kshiftlb k0, k0, 1
0x0000011d kshiftrb k0, k0, 1
0x00000123 kmovd k1, edi
0x00000127 kshiftlb k1, k1, 7
0x0000012d korb k1, k0, k1
; Third select: lanes with k1 set are loaded from [r8 + 64] over ymm1; stored at [rax + 64].
0x00000131 vmovdqu32 ymm1 {k1}, ymmword ptr [r8 + 64]
0x00000138 vmovdqu ymmword ptr [rax + 64], ymm1
; *buffer_table[4] = rax (output buffer address).
0x0000013d mov rcx, qword ptr [rcx + 32]
0x00000141 mov qword ptr [rcx], rax
0x00000144 vzeroupper
0x00000147 ret

Event Timeline

bkramer created this paste.Oct 1 2019, 12:03 PM
bkramer changed the visibility from "All Users" to "Public (No Login Required)".