diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13826,12 +13826,14 @@
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_mul_d512:
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9300,6 +9300,9 @@
  * computations. In vector-reduction arithmetic, the evaluation order is
  * independent of the order of the input elements of V.
 
+ * For floating-point types, we always assume the elements are reassociable even
+ * if -ffast-math is off.
+
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
  * This takes log2(n) steps where n is the number of elements in the vector.
diff --git a/clang/test/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
--- a/clang/test/CodeGen/X86/avx512-reduceIntrin.c
+++ b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
@@ -115,25 +115,25 @@
 double test_mm512_reduce_add_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_add_pd(
-// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
+// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_add_pd(__W);
 }
 
 double test_mm512_reduce_mul_pd(__m512d __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_pd(
-// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK: call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_reduce_mul_pd(__W);
 }
 
 float test_mm512_reduce_add_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_add_ps(
-// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
+// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_add_ps(__W);
 }
 
 float test_mm512_reduce_mul_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_ps(
-// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_reduce_mul_ps(__W);
 }
 
@@ -141,7 +141,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK: call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
+// CHECK: call reassoc double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_add_pd(__M, __W);
 }
 
@@ -149,7 +149,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
 // CHECK: bitcast i8 %{{.*}} to <8 x i1>
 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK: call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
   return _mm512_mask_reduce_mul_pd(__M, __W);
 }
 
@@ -157,7 +157,7 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
-// CHECK: call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
+// CHECK: call reassoc float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_add_ps(__M, __W);
 }
 
@@ -165,6 +165,6 @@
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
 // CHECK: bitcast i16 %{{.*}} to <16 x i1>
 // CHECK: select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
-// CHECK: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+// CHECK: call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
   return _mm512_mask_reduce_mul_ps(__M, __W);
 }
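Note on the CGBuiltin.cpp hunks: `Builder.getFastMathFlags()` returns a mutable reference to the IRBuilder's sticky fast-math flags, so as written the `setAllowReassoc(true)` calls leave `reassoc` set for anything the builder emits afterwards; LLVM's `IRBuilder<>::FastMathFlagGuard` exists for scoping exactly this kind of change. The standalone sketch below is not part of the patch (the module name "reduce_demo" and function name "demo" are invented for illustration, and the API calls assume an LLVM tree contemporary with this change); it reproduces the codegen these hunks rely on and shows the guard restoring the flags:

// Standalone sketch: emit the same reassoc-flagged reduction call the
// updated CHECK lines expect, with FastMathFlagGuard keeping the flag local.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("reduce_demo", Ctx);
  IRBuilder<> Builder(Ctx);

  // double demo(double Acc, <8 x double> V) -- mirrors the fadd_pd512 builtin.
  auto *VecTy = FixedVectorType::get(Builder.getDoubleTy(), 8);
  auto *FnTy = FunctionType::get(Builder.getDoubleTy(),
                                 {Builder.getDoubleTy(), VecTy}, false);
  Function *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  Builder.SetInsertPoint(BasicBlock::Create(Ctx, "entry", Fn));

  // Same intrinsic lookup as the CGBuiltin.cpp hunk: the vector type is the
  // only overloaded type of llvm.vector.reduce.fadd.
  Function *Reduce =
      Intrinsic::getDeclaration(&M, Intrinsic::vector_reduce_fadd, {VecTy});

  Value *Call;
  {
    // Scope the flag change: FastMathFlagGuard snapshots the builder's flags
    // and restores them on destruction, so `reassoc` is attached only to the
    // call created inside this block.
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.getFastMathFlags().setAllowReassoc(true);
    Call = Builder.CreateCall(Reduce, {Fn->getArg(0), Fn->getArg(1)});
  }
  Builder.CreateRet(Call);

  // Prints, among other lines:
  //   call reassoc double @llvm.vector.reduce.fadd.v8f64(double %0, <8 x double> %1)
  M.print(outs(), nullptr);
  return verifyModule(M, &errs());
}

The printed call should match the updated FileCheck lines (`call reassoc double @llvm.vector.reduce.fadd.v8f64(...)`); applying the same guard around the two `setAllowReassoc(true)` calls in the CGBuiltin.cpp hunks would keep the flag change local to the reduction calls.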