Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -510,10 +510,11 @@
                                   OptimizationRemarkEmitter *ORE = nullptr);
 
 /// Generates a vector reduction using shufflevectors to reduce the value.
-Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
-                           RecurrenceDescriptor::MinMaxRecurrenceKind
-                               MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
-                           ArrayRef<Value *> RedOps = None);
+Value *
+getShuffleReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+                        RecurrenceDescriptor::MRK_Invalid,
+                    ArrayRef<Value *> RedOps = None);
 
 /// Create a target reduction of the given vector. The reduction operation
 /// is described by the \p Opcode parameter. min/max reductions require
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- lib/CodeGen/ExpandReductions.cpp
+++ lib/CodeGen/ExpandReductions.cpp
@@ -78,13 +78,14 @@
 bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
   bool Changed = false;
-  SmallVector Worklist;
+  SmallVector Worklist;
   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
     if (auto II = dyn_cast<IntrinsicInst>(&*I))
       Worklist.push_back(II);
 
   for (auto *II : Worklist) {
     IRBuilder<> Builder(II);
+    Value *Acc = nullptr;
     Value *Vec = nullptr;
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -97,6 +98,7 @@
       // without native support.
       if (!II->getFastMathFlags().isFast())
         continue;
+      Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
     case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +120,7 @@
     }
     if (!TTI->shouldExpandReduction(II))
       continue;
-    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    Value *Rdx = getShuffleReduction(Builder, Acc, Vec, getOpcode(ID), MRK);
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1528,7 +1528,8 @@
 
 // Helper to generate a log2 shuffle reduction.
 Value *
-llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+                          unsigned Op,
                           RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
                           ArrayRef<Value *> RedOps) {
   unsigned VF = Src->getType()->getVectorNumElements();
@@ -1537,6 +1538,23 @@
   // round.
   assert(isPowerOf2_32(VF) &&
          "Reduction emission only supported for pow2 vectors!");
+
+  auto CreateReductionOp = [&](Value *X, Value *Y) {
+    Value *Result;
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      // Floating point operations had to be 'fast' to enable the reduction.
+      Result = addFastMathFlag(
+          Builder.CreateBinOp((Instruction::BinaryOps)Op, X, Y, "bin.rdx"));
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, X, Y);
+    }
+    if (!RedOps.empty())
+      propagateIRFlags(Result, RedOps);
+    return Result;
+  };
+
   Value *TmpVec = Src;
   SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
   for (unsigned i = VF; i != 1; i >>= 1) {
@@ -1552,21 +1570,14 @@
         TmpVec, UndefValue::get(TmpVec->getType()),
         ConstantVector::get(ShuffleMask), "rdx.shuf");
-    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
-      // Floating point operations had to be 'fast' to enable the reduction.
-      TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                                   TmpVec, Shuf, "bin.rdx"));
-    } else {
-      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
-             "Invalid min/max");
-      TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
-                                                    Shuf);
-    }
-    if (!RedOps.empty())
-      propagateIRFlags(TmpVec, RedOps);
+    TmpVec = CreateReductionOp(TmpVec, Shuf);
   }
+
   // The result is in the first element of the vector.
-  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  Value *Result = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  if (Acc)
+    Result = CreateReductionOp(Acc, Result);
+  return Result;
 }
 
 /// Create a simple vector reduction specified by an opcode and some
@@ -1643,7 +1654,7 @@
   }
   if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
     return BuildFunc();
-  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+  return getShuffleReduction(Builder, nullptr, Src, Opcode, MinMaxKind, RedOps);
 }
 
 /// Create a vector reduction using a given recurrence descriptor.
Index: test/CodeGen/Generic/expand-experimental-reductions.ll
===================================================================
--- test/CodeGen/Generic/expand-experimental-reductions.ll
+++ test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -92,7 +92,8 @@
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
 ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret float [[TMP0]]
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float undef, [[TMP0]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -107,7 +108,8 @@
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
 ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret float [[TMP0]]
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd fast float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -144,7 +146,8 @@
 ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
 ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: ret float [[TMP0]]
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float undef, [[TMP0]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call fast float
@llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) @@ -159,7 +162,8 @@ ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: ret float [[TMP0]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul fast float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) Index: test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -13,25 +13,28 @@ define float @test_v2f32(float %a0, <2 x float> %a1) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] +; SSE2-NEXT: addps %xmm1, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: ; SSE41-NEXT: haddps %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 @@ -43,31 +46,35 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: haddps %xmm2, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float 
@llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 @@ -80,39 +87,41 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: haddps %xmm2, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vhaddps %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vhaddps %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) @@ -128,9 +137,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: addps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -138,35 +148,36 @@ ; SSE41-NEXT: addps %xmm4, %xmm2 ; SSE41-NEXT: addps %xmm3, %xmm1 ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: haddps %xmm2, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %ymm1, %ymm0, 
%ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vhaddps %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) @@ -353,22 +364,26 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; SSE2-NEXT: addps %xmm0, %xmm1 +; SSE2-NEXT: addss %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: ; SSE41: # %bb.0: ; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32_undef: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 @@ -383,6 +398,7 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32_undef: @@ -391,6 +407,7 @@ ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -399,6 +416,7 @@ ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_undef: @@ -406,6 +424,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 @@ -421,6 +440,7 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32_undef: @@ -430,6 +450,7 @@ 
; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -440,7 +461,7 @@ ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -451,7 +472,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) @@ -470,6 +491,7 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32_undef: @@ -481,6 +503,7 @@ ; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -492,7 +515,7 @@ ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -506,7 +529,7 @@ ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) @@ -520,25 +543,28 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; SSE2-LABEL: test_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE2-NEXT: addpd %xmm1, %xmm2 +; SSE2-NEXT: addsd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f64: ; SSE41: # %bb.0: ; SSE41-NEXT: haddpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: addsd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 @@ -548,33 +574,34 @@ ; SSE2-LABEL: test_v4f64: ; SSE2: # %bb.0: ; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE2-NEXT: addpd %xmm1, %xmm2 +; SSE2-NEXT: addsd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
test_v4f64: ; SSE41: # %bb.0: ; SSE41-NEXT: addpd %xmm2, %xmm1 ; SSE41-NEXT: haddpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: addsd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vhaddpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vhaddpd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) @@ -587,9 +614,10 @@ ; SSE2-NEXT: addpd %xmm4, %xmm2 ; SSE2-NEXT: addpd %xmm3, %xmm1 ; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE2-NEXT: addpd %xmm1, %xmm2 +; SSE2-NEXT: addsd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f64: @@ -598,28 +626,28 @@ ; SSE41-NEXT: addpd %xmm3, %xmm1 ; SSE41-NEXT: addpd %xmm2, %xmm1 ; SSE41-NEXT: haddpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: addsd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vhaddpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) @@ -636,9 +664,10 @@ ; SSE2-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: addpd %xmm2, %xmm4 ; SSE2-NEXT: addpd %xmm1, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; SSE2-NEXT: addpd %xmm4, %xmm0 +; SSE2-NEXT: movapd %xmm4, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE2-NEXT: addpd %xmm4, 
%xmm1 +; SSE2-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f64: @@ -651,31 +680,31 @@ ; SSE41-NEXT: addpd %xmm2, %xmm4 ; SSE41-NEXT: addpd %xmm1, %xmm4 ; SSE41-NEXT: haddpd %xmm4, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: addsd %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %ymm4, %ymm2, %ymm0 +; AVX-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vhaddpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) @@ -861,22 +890,26 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addpd %xmm0, %xmm1 +; SSE2-NEXT: addsd %xmm0, %xmm1 ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f64_undef: ; SSE41: # %bb.0: ; SSE41-NEXT: haddpd %xmm0, %xmm0 +; SSE41-NEXT: addsd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: ; AVX: # %bb.0: ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) ret double %1 @@ -889,6 +922,7 @@ ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addpd %xmm0, %xmm1 +; SSE2-NEXT: addsd %xmm0, %xmm1 ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -896,6 +930,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: haddpd %xmm0, %xmm0 +; SSE41-NEXT: addsd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: @@ -903,7 +938,7 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -912,7 +947,7 @@ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$ymm0 +; AVX512-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) @@ -928,6 +963,7 @@ ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addpd %xmm0, %xmm1 +; SSE2-NEXT: addsd %xmm0, %xmm1 ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -937,6 +973,7 @@ ; SSE41-NEXT: addpd %xmm2, %xmm0 ; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: haddpd %xmm0, %xmm0 +; SSE41-NEXT: addsd %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: @@ -945,7 +982,7 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -957,7 +994,7 @@ ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) @@ -977,6 +1014,7 @@ ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: addsd %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f64_undef: @@ -989,6 +1027,7 @@ ; SSE41-NEXT: addpd %xmm3, %xmm1 ; SSE41-NEXT: addpd %xmm0, %xmm1 ; SSE41-NEXT: haddpd %xmm1, %xmm1 +; SSE41-NEXT: addsd %xmm0, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1000,7 +1039,7 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1013,7 +1052,7 @@ ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) Index: test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -13,27 +13,31 @@ define float @test_v2f32(float %a0, <2 x float> %a1) { ; SSE2-LABEL: test_v2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3] +; SSE2-NEXT: mulps %xmm1, %xmm2 +; SSE2-NEXT: mulss %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm2 +; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; 
AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 @@ -45,9 +49,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulps %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: @@ -55,24 +60,27 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulps %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 @@ -85,9 +93,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulps %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32: @@ -96,31 +105,32 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulps %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmulps 
%ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1) @@ -136,9 +146,10 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT: mulps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3] +; SSE2-NEXT: mulps %xmm2, %xmm1 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32: @@ -149,34 +160,35 @@ ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: mulps %xmm2, %xmm1 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: -; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: 
vmulps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1) @@ -374,6 +386,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] ; SSE2-NEXT: mulps %xmm0, %xmm1 +; SSE2-NEXT: mulss %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -381,18 +394,21 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32_undef: ; AVX: # %bb.0: ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 @@ -407,6 +423,7 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32_undef: @@ -416,6 +433,7 @@ ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: mulps %xmm0, %xmm1 +; SSE41-NEXT: mulss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -425,6 +443,7 @@ ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_undef: @@ -433,6 +452,7 @@ ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 @@ -448,6 +468,7 @@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v8f32_undef: @@ -458,6 +479,7 @@ ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: mulps %xmm0, %xmm1 +; SSE41-NEXT: mulss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -469,7 +491,7 @@ ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -481,7 +503,7 @@ ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0) @@ -500,6 +522,7 
@@ ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f32_undef: @@ -512,6 +535,7 @@ ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: mulps %xmm0, %xmm1 +; SSE41-NEXT: mulss %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -524,7 +548,7 @@ ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -538,7 +562,7 @@ ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vmulss %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0) @@ -552,21 +576,24 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT: vmulpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 @@ -576,28 +603,29 @@ ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX512-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: 
vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1) @@ -610,31 +638,32 @@ ; SSE-NEXT: mulpd %xmm4, %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: -; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1) @@ -651,34 +680,35 @@ ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm2, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; SSE-NEXT: mulpd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: -; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm0 +; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2 ; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v16f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: 
vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1) @@ -838,6 +868,7 @@ ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -845,12 +876,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0) ret double %1 @@ -863,6 +896,7 @@ ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -872,7 +906,7 @@ ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -882,7 +916,7 @@ ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0) @@ -898,6 +932,7 @@ ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -908,7 +943,7 @@ ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -920,7 +955,7 @@ ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0) @@ -940,6 +975,7 @@ ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64_undef: @@ -951,7 +987,7 @@ ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulpd 
%ymm1, %ymm0, %ymm0 -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -964,7 +1000,7 @@ ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vmulsd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
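Note (illustrative, not part of the patch): with the accumulator operand threaded through getShuffleReduction, the expansion of a fast fadd reduction now ends with one extra scalar op that folds the start value in, as the updated CHECK lines in expand-experimental-reductions.ll show. A minimal IR sketch of the expanded form for a <4 x float> reduction, assuming the standard log2 shuffle masks emitted by getShuffleReduction; the function and value names here are illustrative only:

define float @fadd_v4f32_acc_expanded(float %acc, <4 x float> %vec) {
entry:
  ; Two shuffle+fadd rounds reduce the vector; the running sum ends up in lane 0.
  %rdx.shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd fast <4 x float> %vec, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1
  %scalar = extractelement <4 x float> %bin.rdx2, i32 0
  ; New with this patch: the accumulator is no longer dropped; it is combined
  ; with the reduced scalar by one final reduction op.
  %bin.rdx3 = fadd fast float %acc, %scalar
  ret float %bin.rdx3
}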