Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18510,6 +18510,16 @@
   return Result;
 }
 
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
 /// 64-bit unsigned integer to double expansion.
 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
@@ -18564,8 +18574,9 @@
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   SDValue Result;
 
-  if (Subtarget.hasSSE3()) {
-    // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
+  // Keep the SSE3 legality guard for FHADD: shouldUseHorizontalOp() only
+  // weighs profitability and returns true for any subtarget at optsize.
+  if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   } else {
     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -19623,16 +19634,6 @@
   return Op;
 }
 
-/// Horizontal vector math instructions may be slower than normal math with
-/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
-/// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
-                                  const X86Subtarget &Subtarget) {
-  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
-  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
-  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
-}
-
 /// Depending on uarch and/or optimizing for size, we might prefer to use a
 /// vector operation in place of the typical scalar operation.
 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
Index: llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1841,7 +1841,8 @@
 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
 ; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
-; X86-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; X86-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-NEXT:    retl
 ;
Index: llvm/test/CodeGen/X86/haddsub-3.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub-3.ll
+++ llvm/test/CodeGen/X86/haddsub-3.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2  | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx   | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2  | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2            | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2
 
 define float @pr26491(<4 x float> %a0) {
 ; SSE2-LABEL: pr26491:
@@ -58,37 +60,68 @@
 ; SSE2-NEXT:    addpd %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
-; SSSE3-LABEL: PR41414:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movq %rdi, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; SSSE3-NEXT:    subpd {{.*}}(%rip), %xmm2
-; SSSE3-NEXT:    haddpd %xmm2, %xmm2
-; SSSE3-NEXT:    divpd %xmm2, %xmm1
-; SSSE3-NEXT:    divpd %xmm2, %xmm0
-; SSSE3-NEXT:    xorpd %xmm2, %xmm2
-; SSSE3-NEXT:    addpd %xmm2, %xmm0
-; SSSE3-NEXT:    addpd %xmm2, %xmm1
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: PR41414:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movq %rdi, %xmm2
+; SSSE3-SLOW-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-SLOW-NEXT:    subpd {{.*}}(%rip), %xmm2
+; SSSE3-SLOW-NEXT:    movapd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT:    movddup {{.*#+}} xmm2 = xmm3[0,0]
+; SSSE3-SLOW-NEXT:    divpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    divpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT:    xorpd %xmm2, %xmm2
+; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    retq
+;
+; SSSE3-FAST-LABEL: PR41414:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    movq %rdi, %xmm2
+; SSSE3-FAST-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-FAST-NEXT:    subpd {{.*}}(%rip), %xmm2
+; SSSE3-FAST-NEXT:    haddpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT:    divpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    divpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    xorpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: PR41414:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovq %rdi, %xmm1
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-SLOW-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-SLOW-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT:    retq
 ;
-; AVX1-LABEL: PR41414:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq %rdi, %xmm1
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
+; AVX1-FAST-LABEL: PR41414:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vmovq %rdi, %xmm1
+; AVX1-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-FAST-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-FAST-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT:    retq
 ;
 ; AVX2-LABEL: PR41414:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm1
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; AVX2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
Index: llvm/test/CodeGen/X86/haddsub-broadcast.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub-broadcast.ll
+++ llvm/test/CodeGen/X86/haddsub-broadcast.ll
@@ -9,7 +9,8 @@
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; CHECK-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; CHECK-NEXT:    retl
   %conv = uitofp i64 %x to double
Index: llvm/test/CodeGen/X86/scalar-int-to-fp.ll
===================================================================
--- llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -610,8 +610,9 @@
 ; AVX512F_32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F_32-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512F_32-NEXT:    vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512F_32-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX512F_32-NEXT:    vmovlpd %xmm0, (%esp)
+; AVX512F_32-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F_32-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX512F_32-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512F_32-NEXT:    fldl (%esp)
 ; AVX512F_32-NEXT:    movl %ebp, %esp
 ; AVX512F_32-NEXT:    popl %ebp