Index: llvm/trunk/docs/LangRef.rst
===================================================================
--- llvm/trunk/docs/LangRef.rst
+++ llvm/trunk/docs/LangRef.rst
@@ -13733,37 +13733,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.

-'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
 """""""

 ::

-      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a)

 Overview:
 """""""""

-The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point
 ``ADD`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.

-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.


 Arguments:
 """"""""""
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.

 Examples:
@@ -13771,8 +13768,8 @@

 .. code-block:: llvm

-      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction


 '``llvm.experimental.vector.reduce.mul.*``' Intrinsic
@@ -13797,37 +13794,34 @@
 """"""""""
 The argument to this intrinsic must be a vector of integer values.

-'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Syntax:
 """""""

 ::

-      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
-      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+      declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a)

 Overview:
 """""""""

-The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating-point
+The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point
 ``MUL`` reduction of a vector, returning the result as a scalar. The return type
 matches the element-type of the vector input.

-If the intrinsic call has fast-math flags, then the reduction will not preserve
-the associativity of an equivalent scalarized counterpart. If it does not have
-fast-math flags, then the reduction will be *ordered*, implying that the
-operation respects the associativity of a scalarized reduction.
+If the intrinsic call has the 'reassoc' or 'fast' flags set, then the
+reduction will not preserve the associativity of an equivalent scalarized
+counterpart. Otherwise the reduction will be *ordered*, thus implying that
+the operation respects the associativity of a scalarized reduction.


 Arguments:
 """"""""""
-The first argument to this intrinsic is a scalar accumulator value, which is
-only used when there are no fast-math flags attached. This argument may be undef
-when fast-math flags are used. The type of the accumulator matches the
-element-type of the vector input.
-
+The first argument to this intrinsic is a scalar start value for the reduction.
+The type of the start value matches the element-type of the vector input.
 The second argument must be a vector of floating-point values.

 Examples:
@@ -13835,8 +13829,8 @@

 .. code-block:: llvm

-      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
-      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+      %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction
+      %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction

 '``llvm.experimental.vector.reduce.and.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1070,8 +1070,8 @@
   case Intrinsic::experimental_vector_reduce_and:
   case Intrinsic::experimental_vector_reduce_or:
   case Intrinsic::experimental_vector_reduce_xor:
-  case Intrinsic::experimental_vector_reduce_fadd:
-  case Intrinsic::experimental_vector_reduce_fmul:
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
   case Intrinsic::experimental_vector_reduce_smax:
   case Intrinsic::experimental_vector_reduce_smin:
   case Intrinsic::experimental_vector_reduce_fmax:
@@ -1261,12 +1261,16 @@
   case Intrinsic::experimental_vector_reduce_xor:
     return ConcreteTTI->getArithmeticReductionCost(Instruction::Xor, Tys[0],
                                                    /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fadd:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FAdd, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
-  case Intrinsic::experimental_vector_reduce_fmul:
-    return ConcreteTTI->getArithmeticReductionCost(Instruction::FMul, Tys[0],
-                                                   /*IsPairwiseForm=*/false);
+  case Intrinsic::experimental_vector_reduce_v2_fadd:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FAdd, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
+                                   // reductions.
+  case Intrinsic::experimental_vector_reduce_v2_fmul:
+    return ConcreteTTI->getArithmeticReductionCost(
+        Instruction::FMul, Tys[0],
+        /*IsPairwiseForm=*/false); // FIXME: Add new flag for cost of strict
+                                   // reductions.
case Intrinsic::experimental_vector_reduce_smax: case Intrinsic::experimental_vector_reduce_smin: case Intrinsic::experimental_vector_reduce_fmax: Index: llvm/trunk/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/trunk/include/llvm/IR/Intrinsics.td +++ llvm/trunk/include/llvm/IR/Intrinsics.td @@ -1140,14 +1140,14 @@ //===------------------------ Reduction Intrinsics ------------------------===// // -def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty], - [IntrNoMem]>; -def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty], - [IntrNoMem]>; +def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty], + [IntrNoMem]>; def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; Index: llvm/trunk/lib/CodeGen/ExpandReductions.cpp =================================================================== --- llvm/trunk/lib/CodeGen/ExpandReductions.cpp +++ llvm/trunk/lib/CodeGen/ExpandReductions.cpp @@ -29,9 +29,9 @@ unsigned getOpcode(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_v2_fadd: return Instruction::FAdd; - case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_v2_fmul: return Instruction::FMul; case Intrinsic::experimental_vector_reduce_add: return Instruction::Add; @@ -83,22 +83,33 @@ Worklist.push_back(II); for (auto *II : Worklist) { + if (!TTI->shouldExpandReduction(II)) + continue; + + FastMathFlags FMF = + isa(II) ? II->getFastMathFlags() : FastMathFlags{}; + Intrinsic::ID ID = II->getIntrinsicID(); + RecurrenceDescriptor::MinMaxRecurrenceKind MRK = getMRK(ID); + + Value *Rdx = nullptr; IRBuilder<> Builder(II); - bool IsOrdered = false; - Value *Acc = nullptr; - Value *Vec = nullptr; - auto ID = II->getIntrinsicID(); - auto MRK = RecurrenceDescriptor::MRK_Invalid; + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(FMF); switch (ID) { - case Intrinsic::experimental_vector_reduce_fadd: - case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::experimental_vector_reduce_v2_fmul: { // FMFs must be attached to the call, otherwise it's an ordered reduction // and it can't be handled by generating a shuffle sequence. 
- if (!II->getFastMathFlags().isFast()) - IsOrdered = true; - Acc = II->getArgOperand(0); - Vec = II->getArgOperand(1); - break; + Value *Acc = II->getArgOperand(0); + Value *Vec = II->getArgOperand(1); + if (!FMF.allowReassoc()) + Rdx = getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK); + else { + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + Rdx = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(ID), + Acc, Rdx, "bin.rdx"); + } + } break; case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: case Intrinsic::experimental_vector_reduce_and: @@ -109,23 +120,13 @@ case Intrinsic::experimental_vector_reduce_umax: case Intrinsic::experimental_vector_reduce_umin: case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - Vec = II->getArgOperand(0); - MRK = getMRK(ID); - break; + case Intrinsic::experimental_vector_reduce_fmin: { + Value *Vec = II->getArgOperand(0); + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); + } break; default: continue; } - if (!TTI->shouldExpandReduction(II)) - continue; - // Propagate FMF using the builder. - FastMathFlags FMF = - isa(II) ? II->getFastMathFlags() : FastMathFlags{}; - IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); - Builder.setFastMathFlags(FMF); - Value *Rdx = - IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK) - : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); II->replaceAllUsesWith(Rdx); II->eraseFromParent(); Changed = true; Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6736,8 +6736,8 @@ LowerDeoptimizeCall(&I); return; - case Intrinsic::experimental_vector_reduce_fadd: - case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::experimental_vector_reduce_v2_fmul: case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: case Intrinsic::experimental_vector_reduce_and: @@ -8795,15 +8795,17 @@ FMF = I.getFastMathFlags(); switch (Intrinsic) { - case Intrinsic::experimental_vector_reduce_fadd: - if (FMF.isFast()) - Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + case Intrinsic::experimental_vector_reduce_v2_fadd: + if (FMF.allowReassoc()) + Res = DAG.getNode(ISD::FADD, dl, VT, Op1, + DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2)); else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); break; - case Intrinsic::experimental_vector_reduce_fmul: - if (FMF.isFast()) - Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + case Intrinsic::experimental_vector_reduce_v2_fmul: + if (FMF.allowReassoc()) + Res = DAG.getNode(ISD::FMUL, dl, VT, Op1, + DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2)); else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); break; Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -602,6 +602,26 @@ } break; } + case 'e': { + SmallVector Groups; + Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+"); + if (R.match(Name, &Groups)) { + Intrinsic::ID ID = Intrinsic::not_intrinsic; + if (Groups[1] == "fadd") + ID = Intrinsic::experimental_vector_reduce_v2_fadd; + if (Groups[1] == "fmul") + 
ID = Intrinsic::experimental_vector_reduce_v2_fmul; + + if (ID != Intrinsic::not_intrinsic) { + rename(F); + auto Args = F->getFunctionType()->params(); + Type *Tys[] = {F->getFunctionType()->getReturnType(), Args[1]}; + NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + return true; + } + } + break; + } case 'i': case 'l': { bool IsLifetimeStart = Name.startswith("lifetime.start"); @@ -3467,7 +3487,28 @@ DefaultCase(); return; } - + case Intrinsic::experimental_vector_reduce_v2_fmul: { + SmallVector Args; + if (CI->isFast()) + Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0)); + else + Args.push_back(CI->getOperand(0)); + Args.push_back(CI->getOperand(1)); + NewCall = Builder.CreateCall(NewFn, Args); + cast(NewCall)->copyFastMathFlags(CI); + break; + } + case Intrinsic::experimental_vector_reduce_v2_fadd: { + SmallVector Args; + if (CI->isFast()) + Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType())); + else + Args.push_back(CI->getOperand(0)); + Args.push_back(CI->getOperand(1)); + NewCall = Builder.CreateCall(NewFn, Args); + cast(NewCall)->copyFastMathFlags(CI); + break; + } case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: Index: llvm/trunk/lib/IR/IRBuilder.cpp =================================================================== --- llvm/trunk/lib/IR/IRBuilder.cpp +++ llvm/trunk/lib/IR/IRBuilder.cpp @@ -323,7 +323,7 @@ Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_fadd, Tys); + M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys); return createCallHelper(Decl, Ops, this); } @@ -332,7 +332,7 @@ Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_fmul, Tys); + M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys); return createCallHelper(Decl, Ops, this); } Index: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp @@ -801,13 +801,9 @@ ArrayRef RedOps) { assert(isa(Src->getType()) && "Type must be a vector"); - Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); std::function BuildFunc; using RD = RecurrenceDescriptor; RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; - // TODO: Support creating ordered reductions. - FastMathFlags FMFFast; - FMFFast.setFast(); switch (Opcode) { case Instruction::Add: @@ -827,15 +823,15 @@ break; case Instruction::FAdd: BuildFunc = [&]() { - auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); - cast(Rdx)->setFastMathFlags(FMFFast); + auto Rdx = Builder.CreateFAddReduce( + Constant::getNullValue(Src->getType()->getVectorElementType()), Src); return Rdx; }; break; case Instruction::FMul: BuildFunc = [&]() { - auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); - cast(Rdx)->setFastMathFlags(FMFFast); + Type *Ty = Src->getType()->getVectorElementType(); + auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src); return Rdx; }; break; Index: llvm/trunk/test/Assembler/invalid-vecreduce.ll =================================================================== --- llvm/trunk/test/Assembler/invalid-vecreduce.ll +++ llvm/trunk/test/Assembler/invalid-vecreduce.ll @@ -1,34 +1,34 @@ ; RUN: not opt -S < %s 2>&1 | FileCheck %s ; CHECK: Intrinsic has incorrect argument type! 
-; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64 +; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) { - %res = call float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) + %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) ret float %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64 +; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) ret double %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64 +; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) { - %res = call <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) + %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) ret <2 x double> %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64 +; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) { - %res = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) + %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) ret double %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) -declare <2 x double> @llvm.experimental.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) +declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) Index: llvm/trunk/test/Bitcode/upgrade-vecreduce-intrinsics.ll =================================================================== --- llvm/trunk/test/Bitcode/upgrade-vecreduce-intrinsics.ll +++ llvm/trunk/test/Bitcode/upgrade-vecreduce-intrinsics.ll @@ -0,0 +1,64 @@ +; RUN: opt -S < %s | FileCheck %s +; RUN: llvm-dis < %s.bc | FileCheck %s + +define float @fadd_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_acc +; CHECK: %res = call float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_fast_acc +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fadd_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fadd_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_acc +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_undef +; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_acc(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_fast_acc +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + ret float %res +} + +define float @fmul_fast_undef(<4 x float> %in) { +; CHECK-LABEL: @fmul_fast_undef +; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) + ret float %res +} + +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>) -declare fp128 
@llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) - -declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) + +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -5,7 
+5,7 @@ ; CHECK-LABEL: add_HalfS: ; CHECK: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } @@ -23,7 +23,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -45,7 +45,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } @@ -63,7 +63,7 @@ ; CHECK-LABEL: add_D: ; CHECK: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } @@ -84,7 +84,7 @@ ; CHECKNOFP16-NOT: fadd h{{[0-9]+}} ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h ; CHECKNOFP16: ret - %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx) + %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } @@ -95,7 +95,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } @@ -104,16 +104,16 @@ ; CHECK: fadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: faddp d0, v0.2d ; CHECK-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) +declare half 
@llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) Index: llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll =================================================================== --- llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll +++ llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -7,8 +7,8 @@ declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>) declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>) @@ -92,10 +92,11 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: ret float [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float 0.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[TMP1]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) ret float %r } @@ -107,10 +108,11 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: ret float [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast float %accum, [[TMP0]] +; CHECK-NEXT: ret float [[TMP1]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -128,7 +130,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -146,7 +148,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -158,10 +160,11 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 
-; CHECK-NEXT: ret float [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float 1.000000e+00, [[TMP0]] +; CHECK-NEXT: ret float [[TMP1]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) ret float %r } @@ -173,10 +176,11 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: ret float [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float %accum, [[TMP0]] +; CHECK-NEXT: ret float [[TMP1]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -194,7 +198,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -212,7 +216,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } Index: llvm/trunk/test/CodeGen/X86/haddsub.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/haddsub.ll +++ llvm/trunk/test/CodeGen/X86/haddsub.ll @@ -1628,8 +1628,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1638,40 +1638,44 @@ ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm2, %xmm0 +; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE3-SLOW-NEXT: addss %xmm2, %xmm1 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: fadd_reduce_v8f32: ; SSE3-FAST: # %bb.0: ; SSE3-FAST-NEXT: addps %xmm2, %xmm1 -; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE3-FAST-NEXT: addps %xmm1, %xmm0 -; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: movaps %xmm1, %xmm2 +; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-FAST-NEXT: addps %xmm1, %xmm2 +; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 +; SSE3-FAST-NEXT: addss %xmm2, %xmm0 ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: fadd_reduce_v8f32: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; 
AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: fadd_reduce_v8f32: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1679,35 +1683,38 @@ ; SSE3-SLOW-LABEL: fadd_reduce_v4f64: ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2 +; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: fadd_reduce_v4f64: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movapd %xmm1, %xmm0 -; SSE3-FAST-NEXT: addpd %xmm2, %xmm0 -; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE3-FAST-NEXT: addpd %xmm2, %xmm1 +; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE3-FAST-NEXT: addsd %xmm1, %xmm0 ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: fadd_reduce_v4f64: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: fadd_reduce_v4f64: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -14,40 +14,46 @@ define float @test_v2f32(float %a0, <2 x float> %a1) { ; SSE2-LABEL: 
test_v2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT: addss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] +; SSE2-NEXT: addss %xmm1, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: addss %xmm1, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v2f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v2f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v2f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -57,9 +63,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addss %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: @@ -67,41 +74,46 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: addss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: addss %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v4f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX1-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -112,9 +124,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addss %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: @@ -123,53 +136,58 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: addss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: addss %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v8f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 
-; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -182,9 +200,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addss %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -195,58 +214,63 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: addss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: addss %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm0 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v16f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm0 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; 
AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -291,7 +315,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -346,7 +370,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -415,7 +439,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -493,7 +517,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -538,7 +562,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float 
@llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -593,7 +617,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -662,7 +686,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -740,7 +764,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -751,34 +775,39 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: addsd %xmm1, %xmm2 +; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v2f64: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v2f64: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -786,46 +815,51 @@ ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: addpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: addsd %xmm1, %xmm2 +; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f64: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v4f64: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -835,51 +869,56 @@ ; SSE-NEXT: addpd %xmm4, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: addsd %xmm1, %xmm2 +; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f64: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm0 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v8f64: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm0 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -893,58 +932,63 @@ ; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addpd %xmm2, %xmm4 ; SSE-NEXT: addpd %xmm1, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: addsd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: addsd %xmm4, %xmm1 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16f64: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm0 +; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-SLOW-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: test_v16f64: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm0 +; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-FAST-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v16f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX2-NEXT: 
vaddsd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -983,7 +1027,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1031,7 +1075,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1086,7 +1130,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1151,7 +1195,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1190,7 +1234,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1238,7 +1282,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1293,7 +1337,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1358,16 +1402,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) - -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) + +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fadd.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-fadd.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-fadd.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -90,7 +90,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -176,7 +176,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -327,7 +327,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -367,7 +367,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -422,7 +422,7 @@ ; 
AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -512,7 +512,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -667,7 +667,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -699,7 +699,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -746,7 +746,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -975,7 +975,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -1004,7 +1004,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1042,7 +1042,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1101,7 +1101,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1202,7 +1202,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double 
@llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1234,7 +1234,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1275,7 +1275,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1337,7 +1337,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1440,7 +1440,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1466,7 +1466,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1501,7 +1501,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1557,7 +1557,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1654,16 +1654,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) - -declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) +declare float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) + +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -13,29 +13,33 @@ define float @test_v2f32(float %a0, <2 x float> %a1) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT: mulss %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] +; SSE2-NEXT: mulss %xmm1, %xmm2 +; SSE2-NEXT: mulss %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: mulss %xmm1, %xmm2 +; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -45,9 +49,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulss %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: @@ -55,26 +60,29 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulss %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulss %xmm2, 
%xmm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -85,9 +93,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulss %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: @@ -96,32 +105,35 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulss %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -134,9 +146,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulss %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulss %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -147,35 +160,38 
@@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulss %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulss %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -209,7 +225,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -249,7 +265,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -297,7 +313,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -352,7 +368,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -386,7 +402,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; 
AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -426,7 +442,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -474,7 +490,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -529,7 +545,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0) + %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -540,23 +556,26 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -564,29 +583,32 @@ ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; 
AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -596,32 +618,35 @@ ; SSE-NEXT: mulpd %xmm4, %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -635,35 +660,38 @@ ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm2, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: mulsd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: mulsd %xmm4, %xmm1 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm0 +; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2 ; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: 
vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -691,7 +719,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -722,7 +750,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -758,7 +786,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -800,7 +828,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -828,7 +856,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -859,7 +887,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -895,7 +923,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -937,16 +965,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0) + %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) 
-declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>) Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fmul.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-fmul.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-fmul.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -326,7 +326,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -360,7 +360,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -407,7 +407,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -489,7 +489,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: 
retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -636,7 +636,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -668,7 +668,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -715,7 +715,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -797,7 +797,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -944,7 +944,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -973,7 +973,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1011,7 +1011,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1070,7 +1070,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1171,7 +1171,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1199,7 +1199,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -1236,7 
+1236,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -1294,7 +1294,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -1392,7 +1392,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -1418,7 +1418,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1453,7 +1453,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1509,7 +1509,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1606,16 +1606,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.fmul.f32.v16f32(float, <16 x float>) - -declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.fmul.f64.v16f64(double, <16 x double>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) + +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) +declare double 
@llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>)