Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -13455,37 +13455,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.

-'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.ordered.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
 """""""

 ::

-      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.v2f64(double %start_value, <2 x double> %a)

 Overview:
 """""""""

-The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating-point
-``ADD`` reduction of a vector, returning the result as a scalar. The return type
-matches the element-type of the vector input.
+The '``llvm.experimental.vector.reduce.ordered.fadd.*``' intrinsics perform an
+ordered floating-point ``ADD`` reduction of a vector, starting from a given
+start value, and return the result as a scalar. The return type matches both
+the element-type of the vector input and the type of the start value.

-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+The reduction will preserve the associativity of an equivalent scalarized
+reduction, regardless of any fast-math flags specified on the call instruction.

 Arguments:
 """"""""""
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar value that is used as the
+start value of the ordered reduction. The type of the start value matches
+the element-type of the vector input.
 The second argument must be a vector of floating-point values.

 Examples:
 """""""""
@@ -13493,8 +13490,41 @@
 .. code-block:: llvm

-      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %red = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; ordered reduction starting with value 0.0
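+
+As an illustrative sketch only (the value names are arbitrary and the actual
+code generated for the intrinsic is target dependent), the ordered reduction in
+the example above is semantically equivalent to the following scalarized
+sequence, which accumulates the lanes of ``%input`` in order onto the start
+value:
+
+.. code-block:: llvm
+
+      %elt0 = extractelement <4 x float> %input, i32 0
+      %elt1 = extractelement <4 x float> %input, i32 1
+      %elt2 = extractelement <4 x float> %input, i32 2
+      %elt3 = extractelement <4 x float> %input, i32 3
+      %add0 = fadd float 0.0, %elt0
+      %add1 = fadd float %add0, %elt1
+      %add2 = fadd float %add1, %elt2
+      %red  = fadd float %add2, %elt3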
+
+
+'``llvm.experimental.vector.reduce.unordered.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.unordered.fadd.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.unordered.fadd.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.unordered.fadd.*``' intrinsics perform an
+unordered floating-point ``ADD`` reduction of a vector, returning the result
+as a scalar. The return type matches the element-type of the vector input.
+
+The reduction does not need to preserve the associativity of an equivalent
+scalarized reduction.
+
+
+Arguments:
+""""""""""
+The argument must be a vector of floating-point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %red = call float @llvm.experimental.vector.reduce.unordered.fadd.v4f32(<4 x float> %input)
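+
+As an illustrative sketch only (no particular expansion is mandated and the
+value names are arbitrary), the unordered reduction in the example above may be
+reassociated freely, for example into the following pairwise sequence:
+
+.. code-block:: llvm
+
+      %elt0 = extractelement <4 x float> %input, i32 0
+      %elt1 = extractelement <4 x float> %input, i32 1
+      %elt2 = extractelement <4 x float> %input, i32 2
+      %elt3 = extractelement <4 x float> %input, i32 3
+      %add0 = fadd float %elt0, %elt1
+      %add1 = fadd float %elt2, %elt3
+      %red  = fadd float %add0, %add1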
+
+
 '``llvm.experimental.vector.reduce.mul.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13519,37 +13549,35 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.

-'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+'``llvm.experimental.vector.reduce.ordered.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
 """""""

 ::

-      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.ordered.fmul.f64.v2f64(double %start_value, <2 x double> %a)

 Overview:
 """""""""

-The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating-point
-``MUL`` reduction of a vector, returning the result as a scalar. The return type
-matches the element-type of the vector input.
+The '``llvm.experimental.vector.reduce.ordered.fmul.*``' intrinsics perform an
+ordered floating-point ``MUL`` reduction of a vector, starting from a given
+start value, and return the result as a scalar. The return type matches both
+the element-type of the vector input and the type of the start value.

-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+The reduction will preserve the associativity of an equivalent scalarized
+reduction, regardless of any fast-math flags specified on the call instruction.

 Arguments:
 """"""""""
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar value that is used as the
+start value of the ordered reduction. The type of the start value matches
+the element-type of the vector input.
 The second argument must be a vector of floating-point values.

 Examples:
 """""""""
@@ -13557,8 +13585,42 @@
 .. code-block:: llvm

-      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %red = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; ordered reduction starting with value 1.0
+
+
+'``llvm.experimental.vector.reduce.unordered.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.unordered.fmul.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.unordered.fmul.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.unordered.fmul.*``' intrinsics perform an
+unordered floating-point ``MUL`` reduction of a vector, returning the result
+as a scalar. The return type matches the element-type of the vector input.
+
+The reduction does not need to preserve the associativity of an equivalent
+scalarized reduction.
+
+
+Arguments:
+""""""""""
+The argument must be a vector of floating-point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %red = call float @llvm.experimental.vector.reduce.unordered.fmul.v4f32(<4 x float> %input)
+
 '``llvm.experimental.vector.reduce.and.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -1057,14 +1057,16 @@
     case Intrinsic::experimental_vector_reduce_and:
     case Intrinsic::experimental_vector_reduce_or:
     case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_fadd:
-    case Intrinsic::experimental_vector_reduce_fmul:
     case Intrinsic::experimental_vector_reduce_smax:
     case Intrinsic::experimental_vector_reduce_smin:
     case Intrinsic::experimental_vector_reduce_fmax:
     case Intrinsic::experimental_vector_reduce_fmin:
     case Intrinsic::experimental_vector_reduce_umax:
     case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_ordered_fadd:
+    case Intrinsic::experimental_vector_reduce_unordered_fadd:
+    case Intrinsic::experimental_vector_reduce_ordered_fmul:
+    case Intrinsic::experimental_vector_reduce_unordered_fmul:
       return getIntrinsicInstrCost(IID, RetTy, Args[0]->getType(), FMF);
     case Intrinsic::fshl:
     case Intrinsic::fshr: {
@@ -1248,12 +1250,20 @@
     case Intrinsic::experimental_vector_reduce_xor:
       return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
                                                      /*IsPairwiseForm=*/false);
-    case Intrinsic::experimental_vector_reduce_fadd:
+    case Intrinsic::experimental_vector_reduce_ordered_fadd:
       return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
                                                      /*IsPairwiseForm=*/false);
-    case Intrinsic::experimental_vector_reduce_fmul:
+    case Intrinsic::experimental_vector_reduce_unordered_fadd:
+      return ConcreteTTI->getArithmeticReductionCost(
+          Instruction::FAdd, Tys[0]->getVectorElementType(),
+          /*IsPairwiseForm=*/false); // FIXME: This should be set to 'true'
+    case Intrinsic::experimental_vector_reduce_ordered_fmul:
       return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
                                                      /*IsPairwiseForm=*/false);
+    case Intrinsic::experimental_vector_reduce_unordered_fmul:
+      return ConcreteTTI->getArithmeticReductionCost(
+          Instruction::FMul, Tys[0],
+          /*IsPairwiseForm=*/false); //
FIXME: This should be set to 'true' case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_fmax: Index: include/llvm/IR/IRBuilder.h =================================================================== --- include/llvm/IR/IRBuilder.h +++ include/llvm/IR/IRBuilder.h @@ -528,13 +528,19 @@ MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); - /// Create a vector fadd reduction intrinsic of the source vector. + /// Create an ordered vector fadd reduction intrinsic of the source vector. /// The first parameter is a scalar accumulator value for ordered reductions. - CallInst *CreateFAddReduce(Value *Acc, Value *Src); + CallInst *CreateOrderedFAddReduce(Value *Acc, Value *Src); - /// Create a vector fmul reduction intrinsic of the source vector. + /// Create an unordered vector fadd reduction intrinsic of the source vector. + CallInst *CreateUnorderedFAddReduce(Value *Src); + + /// Create an ordered vector fmul reduction intrinsic of the source vector. /// The first parameter is a scalar accumulator value for ordered reductions. - CallInst *CreateFMulReduce(Value *Acc, Value *Src); + CallInst *CreateOrderedFMulReduce(Value *Acc, Value *Src); + + /// Create an unordered vector fmul reduction intrinsic of the source vector. + CallInst *CreateUnorderedFMulReduce(Value *Src); /// Create a vector int add reduction intrinsic of the source vector. CallInst *CreateAddReduce(Value *Src); Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -1122,14 +1122,24 @@ //===------------------------ Reduction Intrinsics ------------------------===// // -def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], +def int_experimental_vector_reduce_ordered_fadd : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; -def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], +def int_experimental_vector_reduce_unordered_fadd + : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_ordered_fmul : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; +def int_experimental_vector_reduce_unordered_fmul + : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; Index: lib/CodeGen/ExpandReductions.cpp =================================================================== --- lib/CodeGen/ExpandReductions.cpp +++ lib/CodeGen/ExpandReductions.cpp @@ -29,9 +29,11 @@ unsigned getOpcode(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_ordered_fadd: + case Intrinsic::experimental_vector_reduce_unordered_fadd: return Instruction::FAdd; - case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_ordered_fmul: + case Intrinsic::experimental_vector_reduce_unordered_fmul: return Instruction::FMul; case Intrinsic::experimental_vector_reduce_add: return Instruction::Add; @@ -90,15 +92,16 @@ auto ID = II->getIntrinsicID(); auto MRK = RecurrenceDescriptor::MRK_Invalid; switch (ID) { - case Intrinsic::experimental_vector_reduce_fadd: - case Intrinsic::experimental_vector_reduce_fmul: - // FMFs must be attached to the call, 
otherwise it's an ordered reduction - // and it can't be handled by generating a shuffle sequence. - if (!II->getFastMathFlags().isFast()) - IsOrdered = true; + case Intrinsic::experimental_vector_reduce_ordered_fadd: + case Intrinsic::experimental_vector_reduce_ordered_fmul: + IsOrdered = true; Acc = II->getArgOperand(0); Vec = II->getArgOperand(1); break; + case Intrinsic::experimental_vector_reduce_unordered_fadd: + case Intrinsic::experimental_vector_reduce_unordered_fmul: + Vec = II->getArgOperand(0); + break; case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: case Intrinsic::experimental_vector_reduce_and: Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6646,8 +6646,10 @@ LowerDeoptimizeCall(&I); return nullptr; - case Intrinsic::experimental_vector_reduce_fadd: - case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_ordered_fadd: + case Intrinsic::experimental_vector_reduce_unordered_fadd: + case Intrinsic::experimental_vector_reduce_ordered_fmul: + case Intrinsic::experimental_vector_reduce_unordered_fmul: case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: case Intrinsic::experimental_vector_reduce_and: @@ -8688,17 +8690,17 @@ FMF = I.getFastMathFlags(); switch (Intrinsic) { - case Intrinsic::experimental_vector_reduce_fadd: - if (FMF.isFast()) - Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); - else - Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + case Intrinsic::experimental_vector_reduce_ordered_fadd: + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); break; - case Intrinsic::experimental_vector_reduce_fmul: - if (FMF.isFast()) - Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); - else - Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + case Intrinsic::experimental_vector_reduce_unordered_fadd: + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_ordered_fmul: + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_unordered_fmul: + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op1); break; case Intrinsic::experimental_vector_reduce_add: Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -605,6 +605,16 @@ } break; } + case 'e': { + if (Name.startswith("experimental.vector.reduce.")) { + Name = Name.substr(27); + if (Name.startswith("fadd") || Name.startswith("fmul")) { + NewFn = nullptr; + return true; + } + } + break; + } case 'i': case 'l': { bool IsLifetimeStart = Name.startswith("lifetime.start"); @@ -1606,6 +1616,10 @@ if (IsNVVM) Name = Name.substr(5); + bool IsExperimentalVecReduce = Name.startswith("experimental.vector.reduce."); + if (IsExperimentalVecReduce) + Name = Name.substr(27); + if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); SmallVector Elts; @@ -3425,6 +3439,25 @@ F->getParent(), Intrinsic::convert_from_fp16, {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); + } else if (IsExperimentalVecReduce) { + Intrinsic::ID ID = Intrinsic::not_intrinsic; + if (Name.startswith("fadd")) + ID = 
CI->hasAllowReassoc() + ? Intrinsic::experimental_vector_reduce_unordered_fadd + : Intrinsic::experimental_vector_reduce_ordered_fadd; + else if(Name.startswith("fmul")) + ID = CI->hasAllowReassoc() + ? Intrinsic::experimental_vector_reduce_unordered_fmul + : Intrinsic::experimental_vector_reduce_ordered_fmul; + + if (ID != Intrinsic::not_intrinsic) { + Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + NewFn = Intrinsic::getDeclaration(F->getParent(), ID, + {CI->getType(), Op1->getType()}); + Rep = CI->isFast() ? Builder.CreateCall(NewFn, {Op1}) + : Builder.CreateCall(NewFn, {Op0, Op1}); + cast(Rep)->copyFastMathFlags(CI); + } } else { llvm_unreachable("Unknown function for CallInst upgrade."); } Index: lib/IR/IRBuilder.cpp =================================================================== --- lib/IR/IRBuilder.cpp +++ lib/IR/IRBuilder.cpp @@ -318,24 +318,40 @@ return createCallHelper(Decl, Ops, Builder); } -CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { +CallInst *IRBuilderBase::CreateOrderedFAddReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_fadd, Tys); + M, Intrinsic::experimental_vector_reduce_ordered_fadd, Tys); return createCallHelper(Decl, Ops, this); } -CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { +CallInst *IRBuilderBase::CreateUnorderedFAddReduce(Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Type *Tys[] = {Src->getType()->getVectorElementType(), Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_unordered_fadd, Tys); + return createCallHelper(Decl, {Src}, this); +} + +CallInst *IRBuilderBase::CreateOrderedFMulReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_fmul, Tys); + M, Intrinsic::experimental_vector_reduce_ordered_fmul, Tys); return createCallHelper(Decl, Ops, this); } +CallInst *IRBuilderBase::CreateUnorderedFMulReduce(Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Type *Tys[] = {Src->getType()->getVectorElementType(), Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_unordered_fmul, Tys); + return createCallHelper(Decl, {Src}, this); +} + CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, Src); Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -804,7 +804,6 @@ ArrayRef RedOps) { assert(isa(Src->getType()) && "Type must be a vector"); - Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); std::function BuildFunc; using RD = RecurrenceDescriptor; RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; @@ -830,14 +829,14 @@ break; case Instruction::FAdd: BuildFunc = [&]() { - auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); + auto Rdx = Builder.CreateUnorderedFAddReduce(Src); cast(Rdx)->setFastMathFlags(FMFFast); return Rdx; }; break; case Instruction::FMul: BuildFunc = [&]() { - auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); + auto 
Rdx = Builder.CreateUnorderedFMulReduce(Src); cast(Rdx)->setFastMathFlags(FMFFast); return Rdx; }; Index: test/Assembler/invalid-vecreduce.ll =================================================================== --- test/Assembler/invalid-vecreduce.ll +++ test/Assembler/invalid-vecreduce.ll @@ -1,34 +1,34 @@ ; RUN: not opt -S < %s 2>&1 | FileCheck %s ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64 +; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.ordered.fadd.f32.f64.v2f64 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) { - %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) + %res = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) ret float %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64 +; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.ordered.fadd.f64.f32.v2f64 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) ret double %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64 +; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.ordered.fadd.v2f64.f64.v2f64 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) { - %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) + %res = call <2 x double> @llvm.experimental.vector.reduce.ordered.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) ret <2 x double> %res } ; CHECK: Intrinsic has incorrect argument type! 
-; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64 +; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.ordered.fadd.f64.v2f64.v2f64 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) ret double %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) -declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) +declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) +declare <2 x double> @llvm.experimental.vector.reduce.ordered.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) Index: test/Bitcode/upgrade-vecreduce-intrinsics.ll =================================================================== --- /dev/null +++ test/Bitcode/upgrade-vecreduce-intrinsics.ll @@ -0,0 +1,66 @@ +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-dis < %s.bc | FileCheck %s + +define float @fadd_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_acc +; CHECK: call float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_undef +; CHECK: call float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_fast_acc +; CHECK: call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_fast_undef +; CHECK: call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_acc +; CHECK: call float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_undef +; CHECK: call float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_acc(<4 x 
float> %in, float %acc) { +; CHECK-LABEL: @fmul_fast_acc +; CHECK: call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_fast_undef +; CHECK: call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +; CHECK-DAG: declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float>) +; CHECK-DAG: declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +; CHECK-DAG: declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float>) +; CHECK-DAG: declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float, <4 x float>) Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.experimental.vector.reduce.unordered.fadd.f16.v1f16(<1 x half>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v1f32(<1 x float>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v1f64(<1 x double>) +declare fp128 @llvm.experimental.vector.reduce.unordered.fadd.f128.v1f128(<1 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v3f32(<3 x float>) +declare fp128 @llvm.experimental.vector.reduce.unordered.fadd.f128.v2f128(<2 x fp128>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.experimental.vector.reduce.unordered.fadd.f16.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.unordered.fadd.f32.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; 
CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.experimental.vector.reduce.unordered.fadd.f64.v1f64(<1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.unordered.fadd.f128.v1f128(<1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.unordered.fadd.f32.v3f32(<3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.unordered.fadd.f128.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float> %a) ret float %b } Index: test/CodeGen/AArch64/vecreduce-fadd.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd.ll +++ test/CodeGen/AArch64/vecreduce-fadd.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: add_HalfS: ; CHECK: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float> %bin.rdx) ret float %r } @@ -23,7 +23,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.unordered.fadd.f16.v4f16(<4 x half> %bin.rdx) ret half %r } @@ -45,7 +45,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.unordered.fadd.f16.v8f16(<8 x half> %bin.rdx) ret half %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %bin.rdx) ret float %r } @@ -63,7 +63,7 @@ ; CHECK-LABEL: add_D: ; CHECK: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double> %bin.rdx) ret double %r } @@ -84,7 +84,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half 
@llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.unordered.fadd.f16.v16f16(<16 x half> %bin.rdx) ret half %r } @@ -95,7 +95,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float> %bin.rdx) ret float %r } @@ -104,16 +104,16 @@ ; CHECK: fadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.experimental.vector.reduce.unordered.fadd.f16.v4f16(<4 x half>) +declare half @llvm.experimental.vector.reduce.unordered.fadd.f16.v8f16(<8 x half>) +declare half @llvm.experimental.vector.reduce.unordered.fadd.f16.v16f16(<16 x half>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double>) Index: test/CodeGen/Generic/expand-experimental-reductions.ll =================================================================== --- test/CodeGen/Generic/expand-experimental-reductions.ll +++ test/CodeGen/Generic/expand-experimental-reductions.ll @@ -7,8 +7,10 @@ declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float>) declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) @@ -95,7 +97,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float 
@llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %vec) ret float %r } @@ -110,7 +112,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %vec) ret float %r } @@ -128,7 +130,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -146,7 +148,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -161,7 +163,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %vec) ret float %r } @@ -176,7 +178,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %vec) ret float %r } @@ -194,7 +196,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -212,7 +214,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } Index: test/CodeGen/X86/haddsub.ll =================================================================== --- test/CodeGen/X86/haddsub.ll +++ test/CodeGen/X86/haddsub.ll @@ -1355,8 +1355,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1398,7 +1398,7 @@ ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float> %a1) ret float %r } @@ -1434,7 +1434,7 @@ ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double> %a1) ret double %r } Index: 
test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -35,7 +35,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float> %a1) ret float %1 } @@ -74,7 +74,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %a1) ret float %1 } @@ -121,7 +121,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float> %a1) ret float %1 } @@ -209,7 +209,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float> %a0) ret float %1 } @@ -249,7 +249,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %a0) ret float %1 } @@ -297,7 +297,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float> %a0) ret float %1 } @@ -352,7 +352,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float> %a0) ret float %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float> %a0) ret float %1 } @@ -426,7 +426,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) + %1 = call fast float 
@llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float> %a0) ret float %1 } @@ -474,7 +474,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float> %a0) ret float %1 } @@ -529,7 +529,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float> %a0) ret float %1 } @@ -556,7 +556,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double> %a1) ret double %1 } @@ -586,7 +586,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double> %a1) ret double %1 } @@ -621,7 +621,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v8f64(<8 x double> %a1) ret double %1 } @@ -663,7 +663,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v16f64(<16 x double> %a1) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double> %a0) ret double %1 } @@ -722,7 +722,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double> %a0) ret double %1 } @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v8f64(<8 x double> %a0) ret double %1 } @@ -800,7 +800,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v16f64(<16 x double> %a0) ret double %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, 
%xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double> %a0) ret double %1 } @@ -859,7 +859,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double> %a0) ret double %1 } @@ -895,7 +895,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v8f64(<8 x double> %a0) ret double %1 } @@ -937,16 +937,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.unordered.fadd.f64.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v2f32(<2 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v4f32(<4 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v8f32(<8 x float>) +declare float @llvm.experimental.vector.reduce.unordered.fadd.f32.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v2f64(<2 x double>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v4f64(<4 x double>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v8f64(<8 x double>) +declare double @llvm.experimental.vector.reduce.unordered.fadd.f64.v16f64(<16 x double>) Index: test/CodeGen/X86/vector-reduce-fadd.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd.ll +++ test/CodeGen/X86/vector-reduce-fadd.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v4f32(float %a0, <4 x float> 
%a1)
 ret float %1
 }

@@ -175,7 +175,7 @@
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
 ret float %1
 }

@@ -326,7 +326,7 @@
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
 ret float %1
 }

@@ -366,7 +366,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
 ret float %1
 }

@@ -421,7 +421,7 @@
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
 ret float %1
 }

@@ -511,7 +511,7 @@
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
 ret float %1
 }

@@ -666,7 +666,7 @@
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
 ret float %1
 }

@@ -698,7 +698,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
 ret float %1
 }

@@ -745,7 +745,7 @@
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
 ret float %1
 }

@@ -827,7 +827,7 @@
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
 ret float %1
 }

@@ -974,7 +974,7 @@
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
 ret float %1
 }

@@ -1003,7 +1003,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
 ret double %1
 }

@@ -1041,7 +1041,7 @@
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
 ret double %1
 }

@@ -1100,7 +1100,7 @@
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
 ret double %1
 }

@@ -1201,7 +1201,7 @@
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
 ret double %1
 }

@@ -1233,7 +1233,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
 ret double %1
 }

@@ -1274,7 +1274,7 @@
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
 ret double %1
 }

@@ -1336,7 +1336,7 @@
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
 ret double %1
 }

@@ -1439,7 +1439,7 @@
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
 ret double %1
 }

@@ -1465,7 +1465,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
 ret double %1
 }

@@ -1500,7 +1500,7 @@
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
 ret double %1
 }

@@ -1556,7 +1556,7 @@
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
 ret double %1
 }

@@ -1653,16 +1653,16 @@
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
 ret double %1
 }

-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fadd.f32.f32.v16f32(float, <16 x float>)

-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fadd.f64.f64.v16f64(double, <16 x double>)

Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -35,7 +35,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v2f32(<2 x float> %a1)
 ret float %1
 }

@@ -74,7 +74,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %a1)
 ret float %1
 }

@@ -121,7 +121,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v8f32(<8 x float> %a1)
 ret float %1
 }

@@ -175,7 +175,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v16f32(<16 x float> %a1)
 ret float %1
 }

@@ -209,7 +209,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v2f32(<2 x float> %a0)
 ret float %1
 }

@@ -249,7 +249,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %a0)
 ret float %1
 }

@@ -297,7 +297,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v8f32(<8 x float> %a0)
 ret float %1
 }

@@ -352,7 +352,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v16f32(<16 x float> %a0)
 ret float %1
 }

@@ -386,7 +386,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v2f32(<2 x float> %a0)
 ret float %1
 }

@@ -426,7 +426,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float> %a0)
 ret float %1
 }

@@ -474,7 +474,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v8f32(<8 x float> %a0)
 ret float %1
 }

@@ -529,7 +529,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.unordered.fmul.f32.v16f32(<16 x float> %a0)
 ret float %1
 }

@@ -556,7 +556,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v2f64(<2 x double> %a1)
 ret double %1
 }

@@ -586,7 +586,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v4f64(<4 x double> %a1)
 ret double %1
 }

@@ -621,7 +621,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v8f64(<8 x double> %a1)
 ret double %1
 }

@@ -663,7 +663,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v16f64(<16 x double> %a1)
 ret double %1
 }

@@ -691,7 +691,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v2f64(<2 x double> %a0)
 ret double %1
 }

@@ -722,7 +722,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v4f64(<4 x double> %a0)
 ret double %1
 }

@@ -758,7 +758,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v8f64(<8 x double> %a0)
 ret double %1
 }

@@ -800,7 +800,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v16f64(<16 x double> %a0)
 ret double %1
 }

@@ -828,7 +828,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v2f64(<2 x double> %a0)
 ret double %1
 }

@@ -859,7 +859,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v4f64(<4 x double> %a0)
 ret double %1
 }

@@ -895,7 +895,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v8f64(<8 x double> %a0)
 ret double %1
 }

@@ -937,16 +937,16 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.unordered.fmul.f64.v16f64(<16 x double> %a0)
 ret double %1
 }

-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v2f32(<2 x float>)
+declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v4f32(<4 x float>)
+declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v8f32(<8 x float>)
+declare float @llvm.experimental.vector.reduce.unordered.fmul.f32.v16f32(<16 x float>)

-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.unordered.fmul.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.unordered.fmul.f64.v4f64(<4 x double>)
+declare double @llvm.experimental.vector.reduce.unordered.fmul.f64.v8f64(<8 x double>)
+declare double @llvm.experimental.vector.reduce.unordered.fmul.f64.v16f64(<16 x double>)

Index: test/CodeGen/X86/vector-reduce-fmul.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul.ll
+++ test/CodeGen/X86/vector-reduce-fmul.ll
@@ -38,7 +38,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
 ret float %1
 }

@@ -89,7 +89,7 @@
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
 ret float %1
 }

@@ -175,7 +175,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
 ret float %1
 }

@@ -326,7 +326,7 @@
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
 ret float %1
 }

@@ -360,7 +360,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
 ret float %1
 }

@@ -407,7 +407,7 @@
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
 ret float %1
 }

@@ -489,7 +489,7 @@
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
 ret float %1
 }

@@ -636,7 +636,7 @@
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
 ret float %1
 }

@@ -668,7 +668,7 @@
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
 ret float %1
 }

@@ -715,7 +715,7 @@
 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
 ret float %1
 }

@@ -797,7 +797,7 @@
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
 ret float %1
 }

@@ -944,7 +944,7 @@
 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
 ret float %1
 }

@@ -973,7 +973,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
 ret double %1
 }

@@ -1011,7 +1011,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
 ret double %1
 }

@@ -1070,7 +1070,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
 ret double %1
 }

@@ -1171,7 +1171,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
 ret double %1
 }

@@ -1199,7 +1199,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
 ret double %1
 }

@@ -1236,7 +1236,7 @@
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
 ret double %1
 }

@@ -1294,7 +1294,7 @@
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
 ret double %1
 }

@@ -1392,7 +1392,7 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
 ret double %1
 }

@@ -1418,7 +1418,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
 ret double %1
 }

@@ -1453,7 +1453,7 @@
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
 ret double %1
 }

@@ -1509,7 +1509,7 @@
 ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
 ret double %1
 }

@@ -1606,16 +1606,16 @@
 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
 ret double %1
 }

-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.ordered.fmul.f32.f32.v16f32(float, <16 x float>)

-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.ordered.fmul.f64.f64.v16f64(double, <16 x double>)