Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -13455,37 +13455,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point
 ``ADD`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.
 
 Arguments:
 """"""""""
 
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.
 
 Examples:
@@ -13493,8 +13490,8 @@
 
 .. code-block:: llvm
 
-      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
 
 '``llvm.experimental.vector.reduce.mul.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -13519,37 +13516,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.
 
-'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
 
 ::
 
-      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a)
 
 Overview:
 """""""""
 
-The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point
 ``MUL`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.
 
-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.
 
 Arguments:
 """"""""""
 
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.
 
 Examples:
@@ -13557,8 +13551,8 @@
 
 .. code-block:: llvm
 
-      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction
 
 '``llvm.experimental.vector.reduce.and.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
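
For illustration, a short self-contained IR sample of the documented semantics (a sketch for this write-up, not taken from the patch; the function name @reduce_example is made up): with the v2 intrinsics the start value always participates in the result. An ordered <4 x float> fadd reduction computes (((%start + %a0) + %a1) + %a2) + %a3, while a call carrying the reassoc or fast flag may regroup the element operations, for example %start + ((%a0 + %a1) + (%a2 + %a3)). The fmul form behaves analogously, with 1.0 as the natural identity start value.

::

      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

      ; illustrative sample, not part of the patch
      define float @reduce_example(<4 x float> %a) {
        ; ordered: evaluated strictly element by element, starting from the start value
        %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a)
        ; relaxed: may be computed as an unordered reduction, with the start value folded in at the end
        %rel = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a)
        %res = fadd float %ord, %rel
        ret float %res
      }
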
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -1057,8 +1057,8 @@
     case Intrinsic::experimental_vector_reduce_and:
     case Intrinsic::experimental_vector_reduce_or:
     case Intrinsic::experimental_vector_reduce_xor:
-    case Intrinsic::experimental_vector_reduce_fadd:
-    case Intrinsic::experimental_vector_reduce_fmul:
+    case Intrinsic::experimental_vector_reduce_v2_fadd:
+    case Intrinsic::experimental_vector_reduce_v2_fmul:
     case Intrinsic::experimental_vector_reduce_smax:
     case Intrinsic::experimental_vector_reduce_smin:
     case Intrinsic::experimental_vector_reduce_fmax:
@@ -1248,12 +1248,16 @@
   case Intrinsic::experimental_vector_reduce_xor:
     return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
                                                    /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fadd:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fmul:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FAdd, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: This should be set to
+                                   // 'FMF.allowReassoc()'
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FMul, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: This should be set to
+                                   // 'FMF.allowReassoc()'
   case Intrinsic::experimental_vector_reduce_smax:
   case Intrinsic::experimental_vector_reduce_smin:
   case Intrinsic::experimental_vector_reduce_fmax:
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -1122,14 +1122,14 @@
 
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
-def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
-                                                    [LLVMMatchType<0>,
-                                                     llvm_anyvector_ty],
-                                                    [IntrNoMem]>;
-def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
-                                                    [LLVMMatchType<0>,
-                                                     llvm_anyvector_ty],
-                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty],
+                                                       [LLVMMatchType<0>,
+                                                        llvm_anyvector_ty],
+                                                       [IntrNoMem]>;
+def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty],
+                                                       [LLVMMatchType<0>,
+                                                        llvm_anyvector_ty],
+                                                       [IntrNoMem]>;
 def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
                                                    [llvm_anyvector_ty],
                                                    [IntrNoMem]>;
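
As a rough guide to the lowering changes that follow (an informal sketch, not code from the patch; the function names are invented here), a call with the reassoc flag may be expanded into an unordered log2 shuffle reduction of the vector followed by a single scalar operation that folds in the start value, which is what the ExpandReductions and SelectionDAGBuilder changes below implement. An ordered call (no reassoc or fast) is kept strictly sequential.

::

      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)

      define float @fmul_relaxed(float %start, <4 x float> %v) {
        %r = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start, <4 x float> %v)
        ret float %r
      }

      ; roughly equivalent expansion of the call above: shuffle-based
      ; unordered reduction, then one scalar multiply with the start value
      define float @fmul_relaxed_expanded(float %start, <4 x float> %v) {
        %rdx.shuf = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
        %bin.rdx = fmul reassoc <4 x float> %v, %rdx.shuf
        %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
        %bin.rdx2 = fmul reassoc <4 x float> %bin.rdx, %rdx.shuf1
        %elt = extractelement <4 x float> %bin.rdx2, i32 0
        %r = fmul reassoc float %start, %elt
        ret float %r
      }
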
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- lib/CodeGen/ExpandReductions.cpp
+++ lib/CodeGen/ExpandReductions.cpp
@@ -29,9 +29,9 @@
 
 unsigned getOpcode(Intrinsic::ID ID) {
   switch (ID) {
-  case Intrinsic::experimental_vector_reduce_fadd:
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
     return Instruction::FAdd;
-  case Intrinsic::experimental_vector_reduce_fmul:
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
     return Instruction::FMul;
   case Intrinsic::experimental_vector_reduce_add:
     return Instruction::Add;
@@ -90,12 +90,11 @@
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
     switch (ID) {
-    case Intrinsic::experimental_vector_reduce_fadd:
-    case Intrinsic::experimental_vector_reduce_fmul:
+    case Intrinsic::experimental_vector_reduce_v2_fadd:
+    case Intrinsic::experimental_vector_reduce_v2_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
       // and it can't be handled by generating a shuffle sequence.
-      if (!II->getFastMathFlags().isFast())
-        IsOrdered = true;
+      IsOrdered = !II->getFastMathFlags().allowReassoc();
       Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6646,8 +6646,8 @@
     LowerDeoptimizeCall(&I);
     return nullptr;
 
-  case Intrinsic::experimental_vector_reduce_fadd:
-  case Intrinsic::experimental_vector_reduce_fmul:
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
   case Intrinsic::experimental_vector_reduce_add:
   case Intrinsic::experimental_vector_reduce_mul:
   case Intrinsic::experimental_vector_reduce_and:
@@ -8688,15 +8688,17 @@
     FMF = I.getFastMathFlags();
 
   switch (Intrinsic) {
-  case Intrinsic::experimental_vector_reduce_fadd:
-    if (FMF.isFast())
-      Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+    if (FMF.allowReassoc())
+      Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
+                        DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
     break;
-  case Intrinsic::experimental_vector_reduce_fmul:
-    if (FMF.isFast())
-      Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
+    if (FMF.allowReassoc())
+      Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
+                        DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
     break;
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -605,6 +605,26 @@
     }
     break;
   }
+  case 'e': {
+    SmallVector<StringRef, 2> Groups;
+    Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+");
+    if (R.match(Name, &Groups)) {
+      Intrinsic::ID ID = Intrinsic::not_intrinsic;
+      if (Groups[1] == "fadd")
+        ID = Intrinsic::experimental_vector_reduce_v2_fadd;
+      if (Groups[1] == "fmul")
+        ID = Intrinsic::experimental_vector_reduce_v2_fmul;
+
+      if (ID != Intrinsic::not_intrinsic) {
+        rename(F);
+        auto Args = F->getFunctionType()->params();
+        Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]};
+        NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+        return true;
+      }
+    }
+    break;
+  }
   case 'i':
   case 'l': {
     bool IsLifetimeStart = Name.startswith("lifetime.start");
@@ -3448,7 +3468,28 @@
     DefaultCase();
     return;
   }
-
+  case Intrinsic::experimental_vector_reduce_v2_fmul: {
+    SmallVector<Value *, 2> Args;
+    if (CI->isFast())
+      Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0));
+    else
+      Args.push_back(CI->getOperand(0));
+    Args.push_back(CI->getOperand(1));
+    NewCall = Builder.CreateCall(NewFn, Args);
+    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
+    break;
+  }
+  case Intrinsic::experimental_vector_reduce_v2_fadd: {
+    SmallVector<Value *, 2> Args;
+    if (CI->isFast())
+      Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType()));
+    else
+      Args.push_back(CI->getOperand(0));
+    Args.push_back(CI->getOperand(1));
+    NewCall = Builder.CreateCall(NewFn, Args);
+    cast<Instruction>(NewCall)->copyFastMathFlags(CI);
+    break;
+  }
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
Index: lib/IR/IRBuilder.cpp
===================================================================
--- lib/IR/IRBuilder.cpp
+++ lib/IR/IRBuilder.cpp
@@ -323,7 +323,7 @@
   Value *Ops[] = {Acc, Src};
   Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+      M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys);
   return createCallHelper(Decl, Ops, this);
 }
 
@@ -332,7 +332,7 @@
   Value *Ops[] = {Acc, Src};
   Type *Tys[] = {Acc->getType(), Src->getType()};
   auto Decl = Intrinsic::getDeclaration(
-      M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+      M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys);
   return createCallHelper(Decl, Ops, this);
 }
 
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -804,13 +804,9 @@
                              ArrayRef<Value *> RedOps) {
   assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
 
-  Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
   std::function<Value *()> BuildFunc;
   using RD = RecurrenceDescriptor;
   RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
-  // TODO: Support creating ordered reductions.
-  FastMathFlags FMFFast;
-  FMFFast.setFast();
 
   switch (Opcode) {
   case Instruction::Add:
@@ -830,15 +826,17 @@
     break;
   case Instruction::FAdd:
     BuildFunc = [&]() {
-      auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
-      cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
+      auto Rdx = Builder.CreateFAddReduce(
+          Constant::getNullValue(Src->getType()->getVectorElementType()), Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMF);
       return Rdx;
     };
     break;
   case Instruction::FMul:
     BuildFunc = [&]() {
-      auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
-      cast<CallInst>(Rdx)->setFastMathFlags(FMFFast);
+      Type *Ty = Src->getType()->getVectorElementType();
+      auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src);
+      cast<CallInst>(Rdx)->setFastMathFlags(FMF);
       return Rdx;
     };
     break;
Index: test/Assembler/invalid-vecreduce.ll
===================================================================
--- test/Assembler/invalid-vecreduce.ll
+++ test/Assembler/invalid-vecreduce.ll
@@ -1,34 +1,34 @@
 ; RUN: not opt -S < %s 2>&1 | FileCheck %s
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64
+; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64
 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) {
-  %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
+  %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in)
   ret float %res
 }
 
 ; CHECK: Intrinsic has incorrect argument type!
-; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64 +; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) ret double %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64 +; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) { - %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) + %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) ret <2 x double> %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64 +; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) ret double %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) -declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) +declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) Index: test/Bitcode/upgrade-vecreduce-intrinsics.ll =================================================================== --- /dev/null +++ test/Bitcode/upgrade-vecreduce-intrinsics.ll @@ -0,0 +1,64 @@ +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-dis < %s.bc | FileCheck %s + +define float @fadd_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_acc +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_fast_acc +; CHECK: %res = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_acc +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_fast_acc +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) +declare float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } Index: test/CodeGen/AArch64/vecreduce-fadd.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd.ll +++ test/CodeGen/AArch64/vecreduce-fadd.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: add_HalfS: ; CHECK: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } @@ -23,7 +23,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -45,7 +45,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd 
v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } @@ -63,7 +63,7 @@ ; CHECK-LABEL: add_D: ; CHECK: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } @@ -84,7 +84,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } @@ -95,7 +95,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } @@ -104,16 +104,16 @@ ; CHECK: fadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) Index: test/CodeGen/Generic/expand-experimental-reductions.ll =================================================================== --- test/CodeGen/Generic/expand-experimental-reductions.ll +++ 
test/CodeGen/Generic/expand-experimental-reductions.ll @@ -7,8 +7,8 @@ declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) @@ -95,7 +95,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) ret float %r } @@ -110,7 +110,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -128,7 +128,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -146,7 +146,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -161,7 +161,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) ret float %r } @@ -176,7 +176,7 @@ ; CHECK-NEXT: ret float [[TMP0]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -194,7 +194,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -212,7 +212,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } Index: test/CodeGen/X86/haddsub.ll =================================================================== --- test/CodeGen/X86/haddsub.ll +++ test/CodeGen/X86/haddsub.ll @@ -1355,8 +1355,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>) +declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1398,7 +1398,7 @@ ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1434,7 +1434,7 @@ ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } Index: test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -35,7 +35,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -74,7 +74,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -121,7 +121,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -209,7 +209,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -249,7 +249,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -297,7 +297,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -352,7 +352,7 @@ ; 
AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -426,7 +426,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -474,7 +474,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -529,7 +529,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -556,7 +556,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -586,7 +586,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -621,7 +621,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -663,7 +663,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -722,7 +722,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -800,7 +800,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -859,7 +859,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -895,7 +895,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -937,16 +937,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double, <2 x double>) +declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double, <8 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double, <16 x double>) Index: test/CodeGen/X86/vector-reduce-fadd.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd.ll +++ test/CodeGen/X86/vector-reduce-fadd.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -326,7 +326,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -366,7 +366,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -421,7 +421,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -511,7 +511,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -666,7 +666,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -698,7 +698,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -745,7 +745,7 @@ ; 
AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -827,7 +827,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -974,7 +974,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -1003,7 +1003,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1041,7 +1041,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1100,7 +1100,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1201,7 +1201,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1233,7 +1233,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1274,7 +1274,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1336,7 +1336,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1439,7 +1439,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, 
<16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
ret double %1
}
@@ -1465,7 +1465,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
@@ -1500,7 +1500,7 @@
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
@@ -1556,7 +1556,7 @@
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
@@ -1653,16 +1653,16 @@
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f64.v16f64(double, <16 x double>)
Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -35,7 +35,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
@@ -74,7 +74,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
@@ -121,7 +121,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
@@ -175,7 +175,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
@@ -209,7 +209,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -249,7 +249,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -297,7 +297,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -352,7 +352,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -386,7 +386,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -426,7 +426,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -474,7 +474,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -529,7 +529,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -556,7 +556,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
@@ -586,7 +586,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
@@ -621,7 +621,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
@@ -663,7 +663,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
@@ -691,7 +691,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -722,7 +722,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -758,7 +758,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -800,7 +800,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
@@ -828,7 +828,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -859,7 +859,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -895,7 +895,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -937,16 +937,16 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double, <16 x double>)
Index: test/CodeGen/X86/vector-reduce-fmul.ll
===================================================================
--- test/CodeGen/X86/vector-reduce-fmul.ll
+++ test/CodeGen/X86/vector-reduce-fmul.ll
@@ -38,7 +38,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
}
@@ -89,7 +89,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
}
@@ -175,7 +175,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
ret float %1
}
@@ -326,7 +326,7 @@
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
ret float %1
}
@@ -360,7 +360,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
ret float %1
}
@@ -407,7 +407,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
ret float %1
}
@@ -489,7 +489,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
ret float %1
}
@@ -636,7 +636,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
ret float %1
}
@@ -668,7 +668,7 @@
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
}
@@ -715,7 +715,7 @@
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
}
@@ -797,7 +797,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
ret float %1
}
@@ -944,7 +944,7 @@
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
+ %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
ret float %1
}
@@ -973,7 +973,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
@@ -1011,7 +1011,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
ret double %1
}
@@ -1070,7 +1070,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
ret double %1
}
@@ -1171,7 +1171,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
ret double %1
}
@@ -1199,7 +1199,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
ret double %1
}
@@ -1236,7 +1236,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
ret double %1
}
@@ -1294,7 +1294,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
ret double %1
}
@@ -1392,7 +1392,7 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
ret double %1
}
@@ -1418,7 +1418,7 @@
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
@@ -1453,7 +1453,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
ret double %1
}
@@ -1509,7 +1509,7 @@
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
ret double %1
}
@@ -1606,16 +1606,16 @@
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
+ %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
ret double %1
}
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.v2.fmul.f32.f32.v16f32(float, <16 x float>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.f64.v16f64(double, <16 x double>)