HomePhabricator

Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load"…

Authored by bkramer on Nov 21 2019, 2:35 AM.

Description

Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"

Summary:
Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"

This reverts commit 5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5.

We're (tensorflow.org/xla team) seeing some miscompiles with the new change, only at -O3, with fast math disabled.

I'm still trying to come up with a useful/small/external example, but for now, the following IR:

; ModuleID = '__compute_module'
;
; Reproducer input: unoptimized IR for @jit_wrapped_fun.31.
; The function runs two loop nests back to back and then publishes a pointer:
;   1. "fusion.1": for i in [0,2), (middle dim trip count 1), k in [0,4):
;        fusion.1[i][0][k] = log(@0 * parameter.3[i][0][k] * parameter.3[i][0][k])
;   2. "fusion":   for i in [0,2), j in [0,3), k in [0,4):
;        fusion[i][j][k] = -(fusion.1[i][0][k]
;                            + (parameter.1 - parameter.2[j][0])^2
;                              / (parameter.3[i][0][k] * parameter.3[i][0][k])) * @1
;   3. stores the `fusion` buffer pointer into slot 0 of the tuple at buffer_table[3].
; All induction variables live in allocas and are reloaded/stored every iteration.
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; @0: little-endian bytes DB 0F C9 40 = f32 0x40C90FDB (~6.2831855, i.e. 2*pi).
@0 = private unnamed_addr constant [4 x i8] c"\DB\0F\C9@"
; @1: little-endian bytes 00 00 00 3F = f32 0x3F000000 (0.5).
@1 = private unnamed_addr constant [4 x i8] c"\00\00\00?"

; Function Attrs: uwtable
define void @jit_wrapped_fun.31(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
  ; Stack slots holding the loop counters of both loop nests.
  %fusion.invar_address.dim.2 = alloca i64
  %fusion.invar_address.dim.1 = alloca i64
  %fusion.invar_address.dim.0 = alloca i64
  %fusion.1.invar_address.dim.2 = alloca i64
  %fusion.1.invar_address.dim.1 = alloca i64
  %fusion.1.invar_address.dim.0 = alloca i64
  ; parameter.3 = buffer_table[1] (read-only input, loaded with !invariant.load).
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = load i8*, i8** %0, !invariant.load !0, !dereferenceable !1, !align !2
  %parameter.3 = bitcast i8* %1 to [2 x [1 x [4 x float]]]*
  ; fusion.1 = buffer_table[5] (written by the first nest, read by the second).
  %2 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %3 = load i8*, i8** %2, !invariant.load !0, !dereferenceable !1, !align !2
  %fusion.1 = bitcast i8* %3 to [2 x [1 x [4 x float]]]*
  store i64 0, i64* %fusion.1.invar_address.dim.0
  br label %fusion.1.loop_header.dim.0

; ---- Loop nest 1 ("fusion.1"): dim.0 in [0,2) ----
fusion.1.loop_header.dim.0:                       ; preds = %fusion.1.loop_exit.dim.1, %entry
  %fusion.1.indvar.dim.0 = load i64, i64* %fusion.1.invar_address.dim.0
  %4 = icmp uge i64 %fusion.1.indvar.dim.0, 2
  br i1 %4, label %fusion.1.loop_exit.dim.0, label %fusion.1.loop_body.dim.0

fusion.1.loop_body.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
  store i64 0, i64* %fusion.1.invar_address.dim.1
  br label %fusion.1.loop_header.dim.1

; dim.1 in [0,1): single iteration.
fusion.1.loop_header.dim.1:                       ; preds = %fusion.1.loop_exit.dim.2, %fusion.1.loop_body.dim.0
  %fusion.1.indvar.dim.1 = load i64, i64* %fusion.1.invar_address.dim.1
  %5 = icmp uge i64 %fusion.1.indvar.dim.1, 1
  br i1 %5, label %fusion.1.loop_exit.dim.1, label %fusion.1.loop_body.dim.1

fusion.1.loop_body.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
  store i64 0, i64* %fusion.1.invar_address.dim.2
  br label %fusion.1.loop_header.dim.2

; dim.2 in [0,4).
fusion.1.loop_header.dim.2:                       ; preds = %fusion.1.loop_body.dim.2, %fusion.1.loop_body.dim.1
  %fusion.1.indvar.dim.2 = load i64, i64* %fusion.1.invar_address.dim.2
  %6 = icmp uge i64 %fusion.1.indvar.dim.2, 4
  br i1 %6, label %fusion.1.loop_exit.dim.2, label %fusion.1.loop_body.dim.2

fusion.1.loop_body.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
  ; %7 = constant @0 reinterpreted as float.
  %7 = load float, float* bitcast ([4 x i8]* @0 to float*)
  ; %9 and %11 load the same element parameter.3[dim.0][0][dim.2] twice (both invariant loads).
  %8 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  %9 = load float, float* %8, !invariant.load !0, !noalias !3
  %10 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  %11 = load float, float* %10, !invariant.load !0, !noalias !3
  %12 = fmul float %9, %11
  %13 = fmul float %7, %12
  %14 = call float @llvm.log.f32(float %13)
  ; Plain (non-invariant) store into fusion.1[dim.0][0][dim.2]; the second nest reads it back.
  %15 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  store float %14, float* %15, !alias.scope !7, !noalias !8
  %invar.inc2 = add nuw nsw i64 %fusion.1.indvar.dim.2, 1
  store i64 %invar.inc2, i64* %fusion.1.invar_address.dim.2
  br label %fusion.1.loop_header.dim.2

fusion.1.loop_exit.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
  %invar.inc1 = add nuw nsw i64 %fusion.1.indvar.dim.1, 1
  store i64 %invar.inc1, i64* %fusion.1.invar_address.dim.1
  br label %fusion.1.loop_header.dim.1

fusion.1.loop_exit.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
  %invar.inc = add nuw nsw i64 %fusion.1.indvar.dim.0, 1
  store i64 %invar.inc, i64* %fusion.1.invar_address.dim.0
  br label %fusion.1.loop_header.dim.0

; ---- Setup for loop nest 2 ("fusion") ----
fusion.1.loop_exit.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
  ; parameter.1 = buffer_table[4] (single float), parameter.2 = buffer_table[2],
  ; fusion (output) = buffer_table[0].
  %16 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %17 = load i8*, i8** %16, !invariant.load !0, !dereferenceable !9, !align !2
  %parameter.1 = bitcast i8* %17 to float*
  %18 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %19 = load i8*, i8** %18, !invariant.load !0, !dereferenceable !10, !align !2
  %parameter.2 = bitcast i8* %19 to [3 x [1 x float]]*
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
  %21 = load i8*, i8** %20, !invariant.load !0, !dereferenceable !11, !align !2
  %fusion = bitcast i8* %21 to [2 x [3 x [4 x float]]]*
  store i64 0, i64* %fusion.invar_address.dim.0
  br label %fusion.loop_header.dim.0

; ---- Loop nest 2 ("fusion"): dim.0 in [0,2) ----
fusion.loop_header.dim.0:                         ; preds = %fusion.loop_exit.dim.1, %fusion.1.loop_exit.dim.0
  %fusion.indvar.dim.0 = load i64, i64* %fusion.invar_address.dim.0
  %22 = icmp uge i64 %fusion.indvar.dim.0, 2
  br i1 %22, label %fusion.loop_exit.dim.0, label %fusion.loop_body.dim.0

fusion.loop_body.dim.0:                           ; preds = %fusion.loop_header.dim.0
  store i64 0, i64* %fusion.invar_address.dim.1
  br label %fusion.loop_header.dim.1

; dim.1 in [0,3).
fusion.loop_header.dim.1:                         ; preds = %fusion.loop_exit.dim.2, %fusion.loop_body.dim.0
  %fusion.indvar.dim.1 = load i64, i64* %fusion.invar_address.dim.1
  %23 = icmp uge i64 %fusion.indvar.dim.1, 3
  br i1 %23, label %fusion.loop_exit.dim.1, label %fusion.loop_body.dim.1

fusion.loop_body.dim.1:                           ; preds = %fusion.loop_header.dim.1
  store i64 0, i64* %fusion.invar_address.dim.2
  br label %fusion.loop_header.dim.2

; dim.2 in [0,4).
fusion.loop_header.dim.2:                         ; preds = %fusion.loop_body.dim.2, %fusion.loop_body.dim.1
  %fusion.indvar.dim.2 = load i64, i64* %fusion.invar_address.dim.2
  %24 = icmp uge i64 %fusion.indvar.dim.2, 4
  br i1 %24, label %fusion.loop_exit.dim.2, label %fusion.loop_body.dim.2

fusion.loop_body.dim.2:                           ; preds = %fusion.loop_header.dim.2
  ; %26 == dim.2 and %29 == dim.0 (multiply by 1, add 0). The udiv results
  ; %27 and %30 are not used anywhere in this function.
  %25 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
  %26 = add nuw nsw i64 0, %25
  %27 = udiv i64 %26, 4
  %28 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
  %29 = add nuw nsw i64 0, %28
  %30 = udiv i64 %29, 2
  ; Non-invariant load of fusion.1[dim.0][0][dim.2], i.e. the value stored by nest 1.
  %31 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %29, i64 0, i64 %26
  %32 = load float, float* %31, !alias.scope !7, !noalias !8
  ; %34 == dim.1; %35 is unused.
  %33 = mul nuw nsw i64 %fusion.indvar.dim.1, 1
  %34 = add nuw nsw i64 0, %33
  %35 = udiv i64 %34, 3
  ; %40 = (parameter.1 - parameter.2[dim.1][0])^2
  %36 = load float, float* %parameter.1, !invariant.load !0, !noalias !3
  %37 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %parameter.2, i64 0, i64 %34, i64 0
  %38 = load float, float* %37, !invariant.load !0, !noalias !3
  %39 = fsub float %36, %38
  %40 = fmul float %39, %39
  ; %42 == dim.2 and %45 == dim.0 again; %43 and %46 are unused.
  %41 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
  %42 = add nuw nsw i64 0, %41
  %43 = udiv i64 %42, 4
  %44 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
  %45 = add nuw nsw i64 0, %44
  %46 = udiv i64 %45, 2
  ; %51 = parameter.3[dim.0][0][dim.2]^2, via two invariant loads of the same address.
  %47 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
  %48 = load float, float* %47, !invariant.load !0, !noalias !3
  %49 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
  %50 = load float, float* %49, !invariant.load !0, !noalias !3
  %51 = fmul float %48, %50
  %52 = fdiv float %40, %51
  %53 = fadd float %32, %52
  %54 = fneg float %53
  ; %55 = constant @1 reinterpreted as float.
  %55 = load float, float* bitcast ([4 x i8]* @1 to float*)
  %56 = fmul float %54, %55
  %57 = getelementptr inbounds [2 x [3 x [4 x float]]], [2 x [3 x [4 x float]]]* %fusion, i64 0, i64 %fusion.indvar.dim.0, i64 %fusion.indvar.dim.1, i64 %fusion.indvar.dim.2
  store float %56, float* %57, !alias.scope !8, !noalias !12
  %invar.inc5 = add nuw nsw i64 %fusion.indvar.dim.2, 1
  store i64 %invar.inc5, i64* %fusion.invar_address.dim.2
  br label %fusion.loop_header.dim.2

fusion.loop_exit.dim.2:                           ; preds = %fusion.loop_header.dim.2
  %invar.inc4 = add nuw nsw i64 %fusion.indvar.dim.1, 1
  store i64 %invar.inc4, i64* %fusion.invar_address.dim.1
  br label %fusion.loop_header.dim.1

fusion.loop_exit.dim.1:                           ; preds = %fusion.loop_header.dim.1
  %invar.inc3 = add nuw nsw i64 %fusion.indvar.dim.0, 1
  store i64 %invar.inc3, i64* %fusion.invar_address.dim.0
  br label %fusion.loop_header.dim.0

; ---- Epilogue: write the fusion buffer pointer into tuple slot 0 (buffer_table[3]) ----
fusion.loop_exit.dim.0:                           ; preds = %fusion.loop_header.dim.0
  %58 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %59 = load i8*, i8** %58, !invariant.load !0, !dereferenceable !2, !align !2
  %tuple.30 = bitcast i8* %59 to [1 x i8*]*
  %60 = bitcast [2 x [3 x [4 x float]]]* %fusion to i8*
  %61 = getelementptr inbounds [1 x i8*], [1 x i8*]* %tuple.30, i64 0, i64 0
  store i8* %60, i8** %61, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare float @llvm.log.f32(float) #1

attributes #0 = { uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

; Metadata: !0 is the empty node used for !invariant.load; !1, !2, !9, !10, !11 are
; the i64 payloads of the !dereferenceable / !align attachments above; !4, !6, !13 are
; scopes in the "XLA global AA domain" (!5) named after buffers 0, 5 and 3, and
; !3, !7, !8, !12, !14 are the scope lists used by !alias.scope / !noalias.
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

gets (correctly) optimized to the one below without the change:

; ModuleID = '__compute_module'
;
; Optimized output of @jit_wrapped_fun.31 that the report describes as CORRECT
; (produced without the reverted change). Only the entry block remains: there are
; no branches, and the arithmetic is expressed with <2 x float>, <4 x float> and
; <8 x float> vector operations.
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  ; %2 = buffer_table[1], %5 = buffer_table[5], both typed as [2 x [1 x [4 x float]]]*.
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
  %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
  %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  ; Row 0: %10 = log(%7 * %7 * 0x401921FB60000000), stored to row 0 of %5.
  %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
  %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
  %8 = fmul <4 x float> %7, %7
  %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
  %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
  store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
  ; Row 1: same computation on element [1][0][0..3], stored to row 1 of %5.
  %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %14 = bitcast float* %12 to <4 x float>*
  %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
  %16 = fmul <4 x float> %15, %15
  %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
  %19 = bitcast float* %13 to <4 x float>*
  store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
  ; %22 = buffer_table[4] (float*), %25 = buffer_table[2], %26 = buffer_table[0] (output, as i8*).
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %21 = bitcast i8** %20 to float**
  %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
  %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %24 = bitcast i8** %23 to [3 x [1 x float]]**
  %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
  %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
  %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
  %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
  %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
  ; %33 = (%27 - %25[0..1])^2 as a 2-vector; %shuffle30 repeats lane 0 four times, then lane 1 four times.
  %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
  %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
  %30 = insertelement <2 x float> undef, float %27, i32 0
  %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
  %32 = fsub <2 x float> %31, %29
  %33 = fmul <2 x float> %32, %32
  %shuffle30 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
  ; %37 = splat of (%27 - %25[2])^2.
  %34 = fsub float %27, %.pre29
  %35 = fmul float %34, %34
  %36 = insertelement <4 x float> undef, float %35, i32 0
  %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
  ; %shuffle reuses the value %10 that was stored to row 0 of %5, duplicated into both halves.
  %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %38 = fmul <4 x float> %7, %7
  ; The divisor %shuffle31 has all eight lanes defined (mask 0,1,2,3,0,1,2,3).
  %shuffle31 = shufflevector <4 x float> %38, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %39 = fdiv <8 x float> %shuffle30, %shuffle31
  %40 = fadd <8 x float> %shuffle, %39
  %41 = fmul <8 x float> %40, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  ; Output bytes [0,32): eight floats starting at the base of %26.
  %42 = bitcast i8* %26 to <8 x float>*
  store <8 x float> %41, <8 x float>* %42, align 8, !alias.scope !8, !noalias !12
  ; Output bytes [32,48): the divisor %38 is a fully defined <4 x float>.
  %43 = getelementptr inbounds i8, i8* %26, i64 32
  %44 = fdiv <4 x float> %37, %38
  %45 = fadd <4 x float> %10, %44
  %46 = fmul <4 x float> %45, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %47 = bitcast i8* %43 to <4 x float>*
  store <4 x float> %46, <4 x float>* %47, align 8, !alias.scope !8, !noalias !12
  ; Second half: row 1 of %5 is reloaded from memory (%49) and row 1 of %2 via an invariant load (%51).
  %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %48 = bitcast float* %.phi.trans.insert to <4 x float>*
  %49 = load <4 x float>, <4 x float>* %48, align 8, !alias.scope !7, !noalias !8
  %50 = bitcast float* %.phi.trans.insert12 to <4 x float>*
  %51 = load <4 x float>, <4 x float>* %50, align 8, !invariant.load !0, !noalias !3
  %shuffle.1 = shufflevector <4 x float> %49, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ; Output bytes [48,80).
  %52 = getelementptr inbounds i8, i8* %26, i64 48
  %53 = fmul <4 x float> %51, %51
  %shuffle31.1 = shufflevector <4 x float> %53, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %54 = fdiv <8 x float> %shuffle30, %shuffle31.1
  %55 = fadd <8 x float> %shuffle.1, %54
  %56 = fmul <8 x float> %55, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %57 = bitcast i8* %52 to <8 x float>*
  store <8 x float> %56, <8 x float>* %57, align 8, !alias.scope !8, !noalias !12
  ; Output bytes [80,96).
  %58 = getelementptr inbounds i8, i8* %26, i64 80
  %59 = fdiv <4 x float> %37, %53
  %60 = fadd <4 x float> %49, %59
  %61 = fmul <4 x float> %60, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %62 = bitcast i8* %58 to <4 x float>*
  store <4 x float> %61, <4 x float>* %62, align 8, !alias.scope !8, !noalias !12
  ; Store the output buffer pointer %26 into slot 0 of the array at buffer_table[3].
  %63 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %64 = bitcast i8** %63 to [1 x i8*]**
  %65 = load [1 x i8*]*, [1 x i8*]** %64, align 8, !invariant.load !0, !dereferenceable !2, !align !2
  %66 = getelementptr inbounds [1 x i8*], [1 x i8*]* %65, i64 0, i64 0
  store i8* %26, i8** %66, align 8, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1

attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

; Metadata: !0 is the empty !invariant.load node; !4, !6, !13 are scopes in domain !5
; named after buffers 0, 5 and 3; !3, !7, !8, !12, !14 are the scope lists referenced
; by !alias.scope / !noalias; the remaining nodes are !dereferenceable / !align payloads.
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

and (incorrectly) optimized to the one below with the change:

; ModuleID = '__compute_module'
;
; Optimized output of @jit_wrapped_fun.31 that the report describes as INCORRECT
; (produced with the change being reverted). Only the entry block remains and the
; arithmetic is vectorized, but two divisor vectors (%41 and %53 below) contain
; undef lanes; see the NOTE(review) comments at those definitions.
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  ; %2 = buffer_table[1], %5 = buffer_table[5], both typed as [2 x [1 x [4 x float]]]*.
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
  %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
  %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  ; Row 0: %10 = log(%7 * %7 * 0x401921FB60000000), stored to row 0 of %5.
  %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
  %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
  %8 = fmul <4 x float> %7, %7
  %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
  %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
  store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
  ; Row 1: same computation on element [1][0][0..3], stored to row 1 of %5.
  %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %14 = bitcast float* %12 to <4 x float>*
  %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
  %16 = fmul <4 x float> %15, %15
  %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
  %19 = bitcast float* %13 to <4 x float>*
  store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
  ; %22 = buffer_table[4] (float*), %25 = buffer_table[2], %26 = buffer_table[0] (output, as i8*).
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %21 = bitcast i8** %20 to float**
  %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
  %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %24 = bitcast i8** %23 to [3 x [1 x float]]**
  %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
  %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
  %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
  %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
  %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
  ; %33 = (%27 - %25[0..1])^2 as a 2-vector; %shuffle32 repeats lane 0 four times, then lane 1 four times.
  %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
  %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
  %30 = insertelement <2 x float> undef, float %27, i32 0
  %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
  %32 = fsub <2 x float> %31, %29
  %33 = fmul <2 x float> %32, %32
  %shuffle32 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
  ; %37 = splat of (%27 - %25[2])^2.
  %34 = fsub float %27, %.pre29
  %35 = fmul float %34, %34
  %36 = insertelement <4 x float> undef, float %35, i32 0
  %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
  %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ; Scalar addresses of element [0][0][3] in the %5 and %2 arrays (used by the scalar loads %50 / %51).
  %38 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 0, i64 0, i64 3
  %39 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 0, i64 0, i64 3
  %40 = fmul <4 x float> %7, %7
  ; NOTE(review): lanes 4..7 of the divisor %41 are undef (mask 0,1,2,3,undef,undef,undef,undef),
  ; yet all eight lanes of the quotient %42 flow into the <8 x float> store below. The
  ; corresponding shuffle for the second half (%70) uses mask 0,1,2,3,0,1,2,3.
  ; Presumably this is one source of the wrong values in the report -- confirm.
  %41 = shufflevector <4 x float> %40, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = fdiv <8 x float> %shuffle32, %41
  %43 = fadd <8 x float> %shuffle, %42
  %44 = fmul <8 x float> %43, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  ; Output bytes [0,32): eight floats starting at the base of %26.
  %45 = bitcast i8* %26 to <8 x float>*
  store <8 x float> %44, <8 x float>* %45, align 8, !alias.scope !8, !noalias !12
  %46 = extractelement <4 x float> %10, i32 0
  %47 = getelementptr inbounds i8, i8* %26, i64 32
  %48 = extractelement <4 x float> %10, i32 1
  %49 = extractelement <4 x float> %10, i32 2
  ; Lane 3 operands are re-read from memory as scalars.
  %50 = load float, float* %38, align 4, !alias.scope !7, !noalias !8
  %51 = load float, float* %39, align 4, !invariant.load !0, !noalias !3
  %52 = fmul float %51, %51
  ; NOTE(review): only lane 3 of the divisor %53 is defined; lanes 0..2 stay undef, while the
  ; addend %58 has all four lanes populated. The matching divisor in the second half (%88)
  ; has lanes 0..2 filled from %69. Presumably this is the other source of the wrong
  ; values in the report -- confirm.
  %53 = insertelement <4 x float> undef, float %52, i32 3
  %54 = fdiv <4 x float> %37, %53
  %55 = insertelement <4 x float> undef, float %46, i32 0
  %56 = insertelement <4 x float> %55, float %48, i32 1
  %57 = insertelement <4 x float> %56, float %49, i32 2
  %58 = insertelement <4 x float> %57, float %50, i32 3
  %59 = fadd <4 x float> %58, %54
  %60 = fmul <4 x float> %59, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  ; Output bytes [32,48).
  %61 = bitcast i8* %47 to <4 x float>*
  store <4 x float> %60, <4 x float>* %61, align 8, !alias.scope !8, !noalias !12
  ; Second half: row 1 of %5 is reloaded from memory (%63) and row 1 of %2 via an invariant load (%65).
  %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %62 = bitcast float* %.phi.trans.insert to <4 x float>*
  %63 = load <4 x float>, <4 x float>* %62, align 8, !alias.scope !7, !noalias !8
  %64 = bitcast float* %.phi.trans.insert12 to <4 x float>*
  %65 = load <4 x float>, <4 x float>* %64, align 8, !invariant.load !0, !noalias !3
  %shuffle.1 = shufflevector <4 x float> %63, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %66 = getelementptr inbounds i8, i8* %26, i64 48
  ; Scalar addresses of element [1][0][3] in the %5 and %2 arrays.
  %67 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 3
  %68 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 3
  %69 = fmul <4 x float> %65, %65
  ; Here the divisor %70 has all eight lanes defined.
  %70 = shufflevector <4 x float> %69, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %71 = fdiv <8 x float> %shuffle32, %70
  %72 = fadd <8 x float> %shuffle.1, %71
  %73 = fmul <8 x float> %72, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  ; Output bytes [48,80).
  %74 = bitcast i8* %66 to <8 x float>*
  store <8 x float> %73, <8 x float>* %74, align 8, !alias.scope !8, !noalias !12
  %75 = extractelement <4 x float> %69, i32 0
  %76 = extractelement <4 x float> %63, i32 0
  %77 = getelementptr inbounds i8, i8* %26, i64 80
  %78 = extractelement <4 x float> %69, i32 1
  %79 = extractelement <4 x float> %63, i32 1
  %80 = extractelement <4 x float> %69, i32 2
  %81 = extractelement <4 x float> %63, i32 2
  %82 = load float, float* %67, align 4, !alias.scope !7, !noalias !8
  %83 = load float, float* %68, align 4, !invariant.load !0, !noalias !3
  %84 = fmul float %83, %83
  ; Divisor %88: lanes 0..2 from %69, lane 3 from the scalar reload; all four lanes defined.
  %85 = insertelement <4 x float> undef, float %75, i32 0
  %86 = insertelement <4 x float> %85, float %78, i32 1
  %87 = insertelement <4 x float> %86, float %80, i32 2
  %88 = insertelement <4 x float> %87, float %84, i32 3
  %89 = fdiv <4 x float> %37, %88
  %90 = insertelement <4 x float> undef, float %76, i32 0
  %91 = insertelement <4 x float> %90, float %79, i32 1
  %92 = insertelement <4 x float> %91, float %81, i32 2
  %93 = insertelement <4 x float> %92, float %82, i32 3
  %94 = fadd <4 x float> %93, %89
  %95 = fmul <4 x float> %94, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  ; Output bytes [80,96).
  %96 = bitcast i8* %77 to <4 x float>*
  store <4 x float> %95, <4 x float>* %96, align 8, !alias.scope !8, !noalias !12
  ; Store the output buffer pointer %26 into slot 0 of the array at buffer_table[3].
  %97 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %98 = bitcast i8** %97 to [1 x i8*]**
  %99 = load [1 x i8*]*, [1 x i8*]** %98, align 8, !invariant.load !0, !dereferenceable !2, !align !2
  %100 = getelementptr inbounds [1 x i8*], [1 x i8*]* %99, i64 0, i64 0
  store i8* %26, i8** %100, align 8, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1

attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

; Metadata: !0 is the empty !invariant.load node; !4, !6, !13 are scopes in domain !5
; named after buffers 0, 5 and 3; !3, !7, !8, !12, !14 are the scope lists referenced
; by !alias.scope / !noalias; the remaining nodes are !dereferenceable / !align payloads.
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

This results in bad numerical answers when used through XLA.
Again, it's not that easy to give a small fully-reproducible example, but the miscompare is:

Expected literal:
(
f32[2,3,4] {
{
  { nan, -inf, -3181.35, -inf },
  { nan, -inf, -28.2577019, -inf },
  { nan, -inf, -28.2577019, -inf }
},
{
  { -inf, -inf, -inf, -inf },
  { -6.60753046e+28, -1.47314833e+23, -inf, -inf },
  { -2.43504347e+30, -5.42892693e+24, -inf, -inf }
}
}
)

Actual literal:
(
f32[2,3,4] {
{
  { nan, -inf, -3181.35, -inf },
  { nan, -inf, -inf, -inf },
  { inf, -inf, -28.2577019, -inf }
},
{
  { -inf, -inf, -inf, -inf },
  { -6.60753046e+28, -1.47314833e+23, -inf, -inf },
  { -2.43504347e+30, -5.42892693e+24, -inf, -inf }
}
}
)

Reviewers: sanjoy.google, sanjoy, ebrevnov, jdoerfert, reames, chandlerc

Subscribers: hiraditya, Charusso, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70516