Page MenuHomePhabricator

Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"
ClosedPublic

Authored by george.karpenkov on Wed, Nov 20, 3:23 PM.

Details

Summary

Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"

This reverts commit 5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5.

We're (tensorflow.org/xla team) seeing some misscompiles with the new change, only at -O3, with fast math disabled.

I'm still trying to come up with a useful/small/external example, but for now, the following IR:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@0 = private unnamed_addr constant [4 x i8] c"\DB\0F\C9@"
@1 = private unnamed_addr constant [4 x i8] c"\00\00\00?"

; Function Attrs: uwtable
define void @jit_wrapped_fun.31(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
  %fusion.invar_address.dim.2 = alloca i64
  %fusion.invar_address.dim.1 = alloca i64
  %fusion.invar_address.dim.0 = alloca i64
  %fusion.1.invar_address.dim.2 = alloca i64
  %fusion.1.invar_address.dim.1 = alloca i64
  %fusion.1.invar_address.dim.0 = alloca i64
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = load i8*, i8** %0, !invariant.load !0, !dereferenceable !1, !align !2
  %parameter.3 = bitcast i8* %1 to [2 x [1 x [4 x float]]]*
  %2 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %3 = load i8*, i8** %2, !invariant.load !0, !dereferenceable !1, !align !2
  %fusion.1 = bitcast i8* %3 to [2 x [1 x [4 x float]]]*
  store i64 0, i64* %fusion.1.invar_address.dim.0
  br label %fusion.1.loop_header.dim.0

fusion.1.loop_header.dim.0:                       ; preds = %fusion.1.loop_exit.dim.1, %entry
  %fusion.1.indvar.dim.0 = load i64, i64* %fusion.1.invar_address.dim.0
  %4 = icmp uge i64 %fusion.1.indvar.dim.0, 2
  br i1 %4, label %fusion.1.loop_exit.dim.0, label %fusion.1.loop_body.dim.0

fusion.1.loop_body.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
  store i64 0, i64* %fusion.1.invar_address.dim.1
  br label %fusion.1.loop_header.dim.1

fusion.1.loop_header.dim.1:                       ; preds = %fusion.1.loop_exit.dim.2, %fusion.1.loop_body.dim.0
  %fusion.1.indvar.dim.1 = load i64, i64* %fusion.1.invar_address.dim.1
  %5 = icmp uge i64 %fusion.1.indvar.dim.1, 1
  br i1 %5, label %fusion.1.loop_exit.dim.1, label %fusion.1.loop_body.dim.1

fusion.1.loop_body.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
  store i64 0, i64* %fusion.1.invar_address.dim.2
  br label %fusion.1.loop_header.dim.2

fusion.1.loop_header.dim.2:                       ; preds = %fusion.1.loop_body.dim.2, %fusion.1.loop_body.dim.1
  %fusion.1.indvar.dim.2 = load i64, i64* %fusion.1.invar_address.dim.2
  %6 = icmp uge i64 %fusion.1.indvar.dim.2, 4
  br i1 %6, label %fusion.1.loop_exit.dim.2, label %fusion.1.loop_body.dim.2

fusion.1.loop_body.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
  %7 = load float, float* bitcast ([4 x i8]* @0 to float*)
  %8 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  %9 = load float, float* %8, !invariant.load !0, !noalias !3
  %10 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  %11 = load float, float* %10, !invariant.load !0, !noalias !3
  %12 = fmul float %9, %11
  %13 = fmul float %7, %12
  %14 = call float @llvm.log.f32(float %13)
  %15 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
  store float %14, float* %15, !alias.scope !7, !noalias !8
  %invar.inc2 = add nuw nsw i64 %fusion.1.indvar.dim.2, 1
  store i64 %invar.inc2, i64* %fusion.1.invar_address.dim.2
  br label %fusion.1.loop_header.dim.2

fusion.1.loop_exit.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
  %invar.inc1 = add nuw nsw i64 %fusion.1.indvar.dim.1, 1
  store i64 %invar.inc1, i64* %fusion.1.invar_address.dim.1
  br label %fusion.1.loop_header.dim.1

fusion.1.loop_exit.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
  %invar.inc = add nuw nsw i64 %fusion.1.indvar.dim.0, 1
  store i64 %invar.inc, i64* %fusion.1.invar_address.dim.0
  br label %fusion.1.loop_header.dim.0

fusion.1.loop_exit.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
  %16 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %17 = load i8*, i8** %16, !invariant.load !0, !dereferenceable !9, !align !2
  %parameter.1 = bitcast i8* %17 to float*
  %18 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %19 = load i8*, i8** %18, !invariant.load !0, !dereferenceable !10, !align !2
  %parameter.2 = bitcast i8* %19 to [3 x [1 x float]]*
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
  %21 = load i8*, i8** %20, !invariant.load !0, !dereferenceable !11, !align !2
  %fusion = bitcast i8* %21 to [2 x [3 x [4 x float]]]*
  store i64 0, i64* %fusion.invar_address.dim.0
  br label %fusion.loop_header.dim.0

fusion.loop_header.dim.0:                         ; preds = %fusion.loop_exit.dim.1, %fusion.1.loop_exit.dim.0
  %fusion.indvar.dim.0 = load i64, i64* %fusion.invar_address.dim.0
  %22 = icmp uge i64 %fusion.indvar.dim.0, 2
  br i1 %22, label %fusion.loop_exit.dim.0, label %fusion.loop_body.dim.0

fusion.loop_body.dim.0:                           ; preds = %fusion.loop_header.dim.0
  store i64 0, i64* %fusion.invar_address.dim.1
  br label %fusion.loop_header.dim.1

fusion.loop_header.dim.1:                         ; preds = %fusion.loop_exit.dim.2, %fusion.loop_body.dim.0
  %fusion.indvar.dim.1 = load i64, i64* %fusion.invar_address.dim.1
  %23 = icmp uge i64 %fusion.indvar.dim.1, 3
  br i1 %23, label %fusion.loop_exit.dim.1, label %fusion.loop_body.dim.1

fusion.loop_body.dim.1:                           ; preds = %fusion.loop_header.dim.1
  store i64 0, i64* %fusion.invar_address.dim.2
  br label %fusion.loop_header.dim.2

fusion.loop_header.dim.2:                         ; preds = %fusion.loop_body.dim.2, %fusion.loop_body.dim.1
  %fusion.indvar.dim.2 = load i64, i64* %fusion.invar_address.dim.2
  %24 = icmp uge i64 %fusion.indvar.dim.2, 4
  br i1 %24, label %fusion.loop_exit.dim.2, label %fusion.loop_body.dim.2

fusion.loop_body.dim.2:                           ; preds = %fusion.loop_header.dim.2
  %25 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
  %26 = add nuw nsw i64 0, %25
  %27 = udiv i64 %26, 4
  %28 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
  %29 = add nuw nsw i64 0, %28
  %30 = udiv i64 %29, 2
  %31 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %29, i64 0, i64 %26
  %32 = load float, float* %31, !alias.scope !7, !noalias !8
  %33 = mul nuw nsw i64 %fusion.indvar.dim.1, 1
  %34 = add nuw nsw i64 0, %33
  %35 = udiv i64 %34, 3
  %36 = load float, float* %parameter.1, !invariant.load !0, !noalias !3
  %37 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %parameter.2, i64 0, i64 %34, i64 0
  %38 = load float, float* %37, !invariant.load !0, !noalias !3
  %39 = fsub float %36, %38
  %40 = fmul float %39, %39
  %41 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
  %42 = add nuw nsw i64 0, %41
  %43 = udiv i64 %42, 4
  %44 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
  %45 = add nuw nsw i64 0, %44
  %46 = udiv i64 %45, 2
  %47 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
  %48 = load float, float* %47, !invariant.load !0, !noalias !3
  %49 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
  %50 = load float, float* %49, !invariant.load !0, !noalias !3
  %51 = fmul float %48, %50
  %52 = fdiv float %40, %51
  %53 = fadd float %32, %52
  %54 = fneg float %53
  %55 = load float, float* bitcast ([4 x i8]* @1 to float*)
  %56 = fmul float %54, %55
  %57 = getelementptr inbounds [2 x [3 x [4 x float]]], [2 x [3 x [4 x float]]]* %fusion, i64 0, i64 %fusion.indvar.dim.0, i64 %fusion.indvar.dim.1, i64 %fusion.indvar.dim.2
  store float %56, float* %57, !alias.scope !8, !noalias !12
  %invar.inc5 = add nuw nsw i64 %fusion.indvar.dim.2, 1
  store i64 %invar.inc5, i64* %fusion.invar_address.dim.2
  br label %fusion.loop_header.dim.2

fusion.loop_exit.dim.2:                           ; preds = %fusion.loop_header.dim.2
  %invar.inc4 = add nuw nsw i64 %fusion.indvar.dim.1, 1
  store i64 %invar.inc4, i64* %fusion.invar_address.dim.1
  br label %fusion.loop_header.dim.1

fusion.loop_exit.dim.1:                           ; preds = %fusion.loop_header.dim.1
  %invar.inc3 = add nuw nsw i64 %fusion.indvar.dim.0, 1
  store i64 %invar.inc3, i64* %fusion.invar_address.dim.0
  br label %fusion.loop_header.dim.0

fusion.loop_exit.dim.0:                           ; preds = %fusion.loop_header.dim.0
  %58 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %59 = load i8*, i8** %58, !invariant.load !0, !dereferenceable !2, !align !2
  %tuple.30 = bitcast i8* %59 to [1 x i8*]*
  %60 = bitcast [2 x [3 x [4 x float]]]* %fusion to i8*
  %61 = getelementptr inbounds [1 x i8*], [1 x i8*]* %tuple.30, i64 0, i64 0
  store i8* %60, i8** %61, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare float @llvm.log.f32(float) #1

attributes #0 = { uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

gets (correctly) optimized to the one below without the change:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
  %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
  %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
  %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
  %8 = fmul <4 x float> %7, %7
  %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
  %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
  store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
  %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %14 = bitcast float* %12 to <4 x float>*
  %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
  %16 = fmul <4 x float> %15, %15
  %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
  %19 = bitcast float* %13 to <4 x float>*
  store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %21 = bitcast i8** %20 to float**
  %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
  %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %24 = bitcast i8** %23 to [3 x [1 x float]]**
  %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
  %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
  %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
  %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
  %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
  %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
  %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
  %30 = insertelement <2 x float> undef, float %27, i32 0
  %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
  %32 = fsub <2 x float> %31, %29
  %33 = fmul <2 x float> %32, %32
  %shuffle30 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
  %34 = fsub float %27, %.pre29
  %35 = fmul float %34, %34
  %36 = insertelement <4 x float> undef, float %35, i32 0
  %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
  %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %38 = fmul <4 x float> %7, %7
  %shuffle31 = shufflevector <4 x float> %38, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %39 = fdiv <8 x float> %shuffle30, %shuffle31
  %40 = fadd <8 x float> %shuffle, %39
  %41 = fmul <8 x float> %40, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %42 = bitcast i8* %26 to <8 x float>*
  store <8 x float> %41, <8 x float>* %42, align 8, !alias.scope !8, !noalias !12
  %43 = getelementptr inbounds i8, i8* %26, i64 32
  %44 = fdiv <4 x float> %37, %38
  %45 = fadd <4 x float> %10, %44
  %46 = fmul <4 x float> %45, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %47 = bitcast i8* %43 to <4 x float>*
  store <4 x float> %46, <4 x float>* %47, align 8, !alias.scope !8, !noalias !12
  %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %48 = bitcast float* %.phi.trans.insert to <4 x float>*
  %49 = load <4 x float>, <4 x float>* %48, align 8, !alias.scope !7, !noalias !8
  %50 = bitcast float* %.phi.trans.insert12 to <4 x float>*
  %51 = load <4 x float>, <4 x float>* %50, align 8, !invariant.load !0, !noalias !3
  %shuffle.1 = shufflevector <4 x float> %49, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %52 = getelementptr inbounds i8, i8* %26, i64 48
  %53 = fmul <4 x float> %51, %51
  %shuffle31.1 = shufflevector <4 x float> %53, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %54 = fdiv <8 x float> %shuffle30, %shuffle31.1
  %55 = fadd <8 x float> %shuffle.1, %54
  %56 = fmul <8 x float> %55, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %57 = bitcast i8* %52 to <8 x float>*
  store <8 x float> %56, <8 x float>* %57, align 8, !alias.scope !8, !noalias !12
  %58 = getelementptr inbounds i8, i8* %26, i64 80
  %59 = fdiv <4 x float> %37, %53
  %60 = fadd <4 x float> %49, %59
  %61 = fmul <4 x float> %60, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %62 = bitcast i8* %58 to <4 x float>*
  store <4 x float> %61, <4 x float>* %62, align 8, !alias.scope !8, !noalias !12
  %63 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %64 = bitcast i8** %63 to [1 x i8*]**
  %65 = load [1 x i8*]*, [1 x i8*]** %64, align 8, !invariant.load !0, !dereferenceable !2, !align !2
  %66 = getelementptr inbounds [1 x i8*], [1 x i8*]* %65, i64 0, i64 0
  store i8* %26, i8** %66, align 8, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1

attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

and (incorrectly) optimized to the one below with the change:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
  %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
  %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
  %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
  %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
  %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
  %8 = fmul <4 x float> %7, %7
  %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
  %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
  store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
  %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %14 = bitcast float* %12 to <4 x float>*
  %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
  %16 = fmul <4 x float> %15, %15
  %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
  %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
  %19 = bitcast float* %13 to <4 x float>*
  store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
  %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
  %21 = bitcast i8** %20 to float**
  %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
  %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
  %24 = bitcast i8** %23 to [3 x [1 x float]]**
  %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
  %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
  %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
  %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
  %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
  %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
  %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
  %30 = insertelement <2 x float> undef, float %27, i32 0
  %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
  %32 = fsub <2 x float> %31, %29
  %33 = fmul <2 x float> %32, %32
  %shuffle32 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
  %34 = fsub float %27, %.pre29
  %35 = fmul float %34, %34
  %36 = insertelement <4 x float> undef, float %35, i32 0
  %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
  %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %38 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 0, i64 0, i64 3
  %39 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 0, i64 0, i64 3
  %40 = fmul <4 x float> %7, %7
  %41 = shufflevector <4 x float> %40, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = fdiv <8 x float> %shuffle32, %41
  %43 = fadd <8 x float> %shuffle, %42
  %44 = fmul <8 x float> %43, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %45 = bitcast i8* %26 to <8 x float>*
  store <8 x float> %44, <8 x float>* %45, align 8, !alias.scope !8, !noalias !12
  %46 = extractelement <4 x float> %10, i32 0
  %47 = getelementptr inbounds i8, i8* %26, i64 32
  %48 = extractelement <4 x float> %10, i32 1
  %49 = extractelement <4 x float> %10, i32 2
  %50 = load float, float* %38, align 4, !alias.scope !7, !noalias !8
  %51 = load float, float* %39, align 4, !invariant.load !0, !noalias !3
  %52 = fmul float %51, %51
  %53 = insertelement <4 x float> undef, float %52, i32 3
  %54 = fdiv <4 x float> %37, %53
  %55 = insertelement <4 x float> undef, float %46, i32 0
  %56 = insertelement <4 x float> %55, float %48, i32 1
  %57 = insertelement <4 x float> %56, float %49, i32 2
  %58 = insertelement <4 x float> %57, float %50, i32 3
  %59 = fadd <4 x float> %58, %54
  %60 = fmul <4 x float> %59, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %61 = bitcast i8* %47 to <4 x float>*
  store <4 x float> %60, <4 x float>* %61, align 8, !alias.scope !8, !noalias !12
  %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
  %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
  %62 = bitcast float* %.phi.trans.insert to <4 x float>*
  %63 = load <4 x float>, <4 x float>* %62, align 8, !alias.scope !7, !noalias !8
  %64 = bitcast float* %.phi.trans.insert12 to <4 x float>*
  %65 = load <4 x float>, <4 x float>* %64, align 8, !invariant.load !0, !noalias !3
  %shuffle.1 = shufflevector <4 x float> %63, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %66 = getelementptr inbounds i8, i8* %26, i64 48
  %67 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 3
  %68 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 3
  %69 = fmul <4 x float> %65, %65
  %70 = shufflevector <4 x float> %69, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %71 = fdiv <8 x float> %shuffle32, %70
  %72 = fadd <8 x float> %shuffle.1, %71
  %73 = fmul <8 x float> %72, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %74 = bitcast i8* %66 to <8 x float>*
  store <8 x float> %73, <8 x float>* %74, align 8, !alias.scope !8, !noalias !12
  %75 = extractelement <4 x float> %69, i32 0
  %76 = extractelement <4 x float> %63, i32 0
  %77 = getelementptr inbounds i8, i8* %26, i64 80
  %78 = extractelement <4 x float> %69, i32 1
  %79 = extractelement <4 x float> %63, i32 1
  %80 = extractelement <4 x float> %69, i32 2
  %81 = extractelement <4 x float> %63, i32 2
  %82 = load float, float* %67, align 4, !alias.scope !7, !noalias !8
  %83 = load float, float* %68, align 4, !invariant.load !0, !noalias !3
  %84 = fmul float %83, %83
  %85 = insertelement <4 x float> undef, float %75, i32 0
  %86 = insertelement <4 x float> %85, float %78, i32 1
  %87 = insertelement <4 x float> %86, float %80, i32 2
  %88 = insertelement <4 x float> %87, float %84, i32 3
  %89 = fdiv <4 x float> %37, %88
  %90 = insertelement <4 x float> undef, float %76, i32 0
  %91 = insertelement <4 x float> %90, float %79, i32 1
  %92 = insertelement <4 x float> %91, float %81, i32 2
  %93 = insertelement <4 x float> %92, float %82, i32 3
  %94 = fadd <4 x float> %93, %89
  %95 = fmul <4 x float> %94, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
  %96 = bitcast i8* %77 to <4 x float>*
  store <4 x float> %95, <4 x float>* %96, align 8, !alias.scope !8, !noalias !12
  %97 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
  %98 = bitcast i8** %97 to [1 x i8*]**
  %99 = load [1 x i8*]*, [1 x i8*]** %98, align 8, !invariant.load !0, !dereferenceable !2, !align !2
  %100 = getelementptr inbounds [1 x i8*], [1 x i8*]* %99, i64 0, i64 0
  store i8* %26, i8** %100, align 8, !alias.scope !14, !noalias !8
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1

attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}

This results in bad numerical answers when used through XLA.
Again, it's not that easy to give a small fully-reproducible example, but the misscompare is:

Expected literal:
(
f32[2,3,4] {
{
  { nan, -inf, -3181.35, -inf },
  { nan, -inf, -28.2577019, -inf },
  { nan, -inf, -28.2577019, -inf }
},
{
  { -inf, -inf, -inf, -inf },
  { -6.60753046e+28, -1.47314833e+23, -inf, -inf },
  { -2.43504347e+30, -5.42892693e+24, -inf, -inf }
}
}
)

Actual literal:
(
f32[2,3,4] {
{
  { nan, -inf, -3181.35, -inf },
  { nan, -inf, -inf, -inf },
  { inf, -inf, -28.2577019, -inf }
},
{
  { -inf, -inf, -inf, -inf },
  { -6.60753046e+28, -1.47314833e+23, -inf, -inf },
  { -2.43504347e+30, -5.42892693e+24, -inf, -inf }
}
}
)

Diff Detail

Event Timeline

Herald added a project: Restricted Project. · View Herald TranscriptWed, Nov 20, 3:23 PM
george.karpenkov edited the summary of this revision. (Show Details)Wed, Nov 20, 3:33 PM

Would you be OK speculatively reverting this change?

Would you be OK speculatively reverting this change?

If you're reasonable sure this is miscompiling, definitely revert. Help narrowing it down would be appreciated though.

If you're reasonable sure this is miscompiling, definitely revert. Help narrowing it down would be appreciated though.

Yes, we're seeing multiple different failures from this commit. Let me try to narrow it down to a smaller IR input.

Looking at the provided IRs I can see what happened but I can't understand why. Provided IR is fine for me to continue investigation. I just need to know command line to reproduce. Interestingly, if I apply instcombine to initial IR (results are identical with and without my change) it is simplified to the form which I believe can't result in the provided "incorrect" version of IR.

This revision was not accepted when it landed; it landed in state Needs Review.Thu, Nov 21, 2:40 AM
This revision was automatically updated to reflect the committed changes.

Pushed this in 446acafb82b5c116b6c94c11d4ac4db7641fa58d. The difference is visible with opt -O3, right? Or do you need an executable test (that's hard)

Pushed this in 446acafb82b5c116b6c94c11d4ac4db7641fa58d. The difference is visible with opt -O3, right? Or do you need an executable test (that's hard)

Yes, I was able to reproduce. Thanks!