Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -730,6 +730,7 @@ setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); @@ -765,6 +766,7 @@ setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); @@ -980,6 +982,7 @@ setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted @@ -14651,31 +14654,39 @@ // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); - assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || + VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v8f32) && "Unexpected type in LowerFCOPYSIGN"); + MVT EltVT = VT.getScalarType(); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : - (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); - const unsigned SizeInBits = VT.getSizeInBits(); + EltVT == MVT::f64 ? APFloat::IEEEdouble + : (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); - // Perform all logic operations as 16-byte vectors because there are no + // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. - MVT LogicVT = - (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + // TODO: This isn't necessary. If we used scalar types, we might avoid some + // unnecessary splats, but we might miss load folding opportunities. Should + // this decision be based on OptimizeForSize? + bool IsFakeVector = !VT.isVector() && !IsF128; + MVT LogicVT = VT; + if (IsFakeVector) + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + + // The mask constants are automatically splatted for vector types. + unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( - APFloat(Sem, APInt::getSignBit(SizeInBits)), dl, LogicVT); + APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + SDValue MagMask = DAG.getConstantFP( + APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). - if (!IsF128) + if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). - // If it's a constant, we can clear it here. - SDValue MagMask = DAG.getConstantFP( - APFloat(Sem, ~APInt::getSignBit(SizeInBits)), dl, LogicVT); - // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; @@ -14685,16 +14696,15 @@ MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. - if (!IsF128) + if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); - return IsF128 ? Or : - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, - DAG.getIntPtrConstant(0, dl)); + return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { Index: test/CodeGen/X86/vec-copysign.ll =================================================================== --- test/CodeGen/X86/vec-copysign.ll +++ test/CodeGen/X86/vec-copysign.ll @@ -1,286 +1,161 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=CHECK -; FIXME: These don't have to be scalarized. +; Assertions have been enhanced from utils/update_test_checks.py to show the constant pool values. +; Use a macosx triple to make sure the format of those constant strings is exact. + +; CHECK: [[SIGNMASK1:L.+]]: +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 + +; CHECK: [[MAGMASK1:L.+]]: +; CHECK-NEXT: .long 2147483647 +; CHECK-NEXT: .long 2147483647 +; CHECK-NEXT: .long 2147483647 +; CHECK-NEXT: .long 2147483647 define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind { ; SSE2-LABEL: v4f32: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] -; SSE2-NEXT: andps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm5 -; SSE2-NEXT: andps %xmm5, %xmm4 -; SSE2-NEXT: orps %xmm2, %xmm4 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE2-NEXT: andps %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm0, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE2-NEXT: andps %xmm5, %xmm6 -; SSE2-NEXT: orps %xmm2, %xmm6 -; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE2-NEXT: movaps %xmm1, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm5, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: andps %xmm3, %xmm1 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: andps [[SIGNMASK1]](%rip), %xmm1 +; SSE2-NEXT: andps [[MAGMASK1]](%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: v4f32: ; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} xmm4 -; AVX-NEXT: vandps %xmm4, %xmm0, %xmm5 -; AVX-NEXT: vorps %xmm3, %xmm5, %xmm3 -; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX-NEXT: vandps %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] -; AVX-NEXT: vandps %xmm4, %xmm6, %xmm6 -; AVX-NEXT: vorps %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] -; AVX-NEXT: vandpd %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm4, %xmm6, %xmm6 -; AVX-NEXT: vorpd %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: vandps %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vandps [[SIGNMASK1]](%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps [[MAGMASK1]](%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; AVX-NEXT: retq ; %tmp = tail call <4 x float> @llvm.copysign.v4f32( <4 x float> %a, <4 x float> %b ) ret <4 x float> %tmp } +; SSE2: [[SIGNMASK2:L.+]]: +; SSE2-NEXT: .long 2147483648 +; SSE2-NEXT: .long 2147483648 +; SSE2-NEXT: .long 2147483648 +; SSE2-NEXT: .long 2147483648 + +; SSE2: [[MAGMASK2:L.+]]: +; SSE2-NEXT: .long 2147483647 +; SSE2-NEXT: .long 2147483647 +; SSE2-NEXT: .long 2147483647 +; SSE2-NEXT: .long 2147483647 + +; AVX: [[SIGNMASK2:L.+]]: +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 +; AVX-NEXT: .long 2147483648 + +; AVX: [[MAGMASK2:L.+]]: +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 +; AVX-NEXT: .long 2147483647 + define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind { ; SSE2-LABEL: v8f32: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, %xmm5 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm8 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] -; SSE2-NEXT: andps %xmm8, %xmm0 -; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1,2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm6 -; SSE2-NEXT: andps %xmm6, %xmm7 -; SSE2-NEXT: orps %xmm0, %xmm7 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: andps %xmm8, %xmm0 -; SSE2-NEXT: movaps %xmm5, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE2-NEXT: andps %xmm6, %xmm4 -; SSE2-NEXT: orps %xmm0, %xmm4 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE2-NEXT: movaps %xmm2, %xmm7 -; SSE2-NEXT: andps %xmm8, %xmm7 -; SSE2-NEXT: movaps %xmm5, %xmm0 -; SSE2-NEXT: andps %xmm6, %xmm0 -; SSE2-NEXT: orps %xmm7, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andps %xmm8, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1] -; SSE2-NEXT: andps %xmm6, %xmm5 -; SSE2-NEXT: orps %xmm2, %xmm5 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE2-NEXT: andps %xmm8, %xmm2 -; SSE2-NEXT: movaps %xmm1, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; SSE2-NEXT: andps %xmm6, %xmm4 -; SSE2-NEXT: orps %xmm2, %xmm4 -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE2-NEXT: andps %xmm8, %xmm2 -; SSE2-NEXT: movaps %xmm1, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE2-NEXT: andps %xmm6, %xmm5 -; SSE2-NEXT: orps %xmm2, %xmm5 -; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE2-NEXT: movaps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm8, %xmm4 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: andps %xmm6, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] -; SSE2-NEXT: andps %xmm8, %xmm3 -; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: andps %xmm6, %xmm1 +; SSE2-NEXT: movaps [[SIGNMASK2]](%rip), %xmm4 +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: movaps [[MAGMASK2]](%rip), %xmm5 +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 ; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: v8f32: ; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] -; AVX-NEXT: vandps %xmm2, %xmm4, %xmm5 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX-NEXT: vmovaps {{.*#+}} xmm3 -; AVX-NEXT: vandps %xmm3, %xmm6, %xmm7 -; AVX-NEXT: vorps %xmm5, %xmm7, %xmm8 -; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm4[1,1,3,3] -; AVX-NEXT: vandps %xmm2, %xmm7, %xmm7 -; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm6[1,1,3,3] -; AVX-NEXT: vandps %xmm3, %xmm5, %xmm5 -; AVX-NEXT: vorps %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0] -; AVX-NEXT: vandpd %xmm2, %xmm7, %xmm7 -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0] -; AVX-NEXT: vandpd %xmm3, %xmm5, %xmm5 -; AVX-NEXT: vorpd %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm8[0,1],xmm5[0],xmm8[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX-NEXT: vandps %xmm2, %xmm4, %xmm4 -; AVX-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX-NEXT: vandps %xmm3, %xmm6, %xmm6 -; AVX-NEXT: vorps %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm5 -; AVX-NEXT: vandps %xmm3, %xmm0, %xmm6 -; AVX-NEXT: vorps %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] -; AVX-NEXT: vandps %xmm2, %xmm6, %xmm6 -; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] -; AVX-NEXT: vandps %xmm3, %xmm7, %xmm7 -; AVX-NEXT: vorps %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0] -; AVX-NEXT: vandpd %xmm2, %xmm6, %xmm6 -; AVX-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm3, %xmm7, %xmm7 -; AVX-NEXT: vorpd %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX-NEXT: vandps [[SIGNMASK2]](%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps [[MAGMASK2]](%rip), %ymm0, %ymm0 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; %tmp = tail call <8 x float> @llvm.copysign.v8f32( <8 x float> %a, <8 x float> %b ) ret <8 x float> %tmp } +; CHECK: [[SIGNMASK3:L.+]]: +; CHECK-NEXT: .quad -9223372036854775808 +; CHECK-NEXT: .quad -9223372036854775808 + +; CHECK: [[MAGMASK3:L.+]]: +; CHECK-NEXT: .quad 9223372036854775807 +; CHECK-NEXT: .quad 9223372036854775807 + define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind { ; SSE2-LABEL: v2f64: ; SSE2: # BB#0: -; SSE2-NEXT: movaps {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00] -; SSE2-NEXT: movaps %xmm1, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: movaps {{.*#+}} xmm5 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: andps %xmm5, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: andps %xmm3, %xmm1 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: andps [[SIGNMASK3]](%rip), %xmm1 +; SSE2-NEXT: andps [[MAGMASK3]](%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movapd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: v2f64: ; AVX: # BB#0: -; AVX-NEXT: vmovapd {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00] -; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vmovapd {{.*#+}} xmm4 -; AVX-NEXT: vandpd %xmm4, %xmm0, %xmm5 -; AVX-NEXT: vorpd %xmm3, %xmm5, %xmm3 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX-NEXT: vandps [[SIGNMASK3]](%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps [[MAGMASK3]](%rip), %xmm0, %xmm0 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; %tmp = tail call <2 x double> @llvm.copysign.v2f64( <2 x double> %a, <2 x double> %b ) ret <2 x double> %tmp } +; SSE2: [[SIGNMASK4:L.+]]: +; SSE2-NEXT: .quad -9223372036854775808 +; SSE2-NEXT: .quad -9223372036854775808 + +; SSE2: [[MAGMASK4:L.+]]: +; SSE2-NEXT: .quad 9223372036854775807 +; SSE2-NEXT: .quad 9223372036854775807 + +; AVX: [[SIGNMASK4:L.+]]: +; AVX-NEXT: .quad -9223372036854775808 +; AVX-NEXT: .quad -9223372036854775808 +; AVX-NEXT: .quad -9223372036854775808 +; AVX-NEXT: .quad -9223372036854775808 + +; AVX: [[MAGMASK4:L.+]]: +; AVX-NEXT: .quad 9223372036854775807 +; AVX-NEXT: .quad 9223372036854775807 +; AVX-NEXT: .quad 9223372036854775807 +; AVX-NEXT: .quad 9223372036854775807 + define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind { ; SSE2-LABEL: v4f64: ; SSE2: # BB#0: -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: movaps {{.*#+}} xmm5 = [-0.000000e+00,-0.000000e+00] -; SSE2-NEXT: movaps %xmm2, %xmm6 -; SSE2-NEXT: andps %xmm5, %xmm6 -; SSE2-NEXT: movaps {{.*#+}} xmm7 -; SSE2-NEXT: andps %xmm7, %xmm0 -; SSE2-NEXT: orps %xmm6, %xmm0 -; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andps %xmm5, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] -; SSE2-NEXT: andps %xmm7, %xmm4 -; SSE2-NEXT: orps %xmm2, %xmm4 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: movaps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm5, %xmm4 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: andps %xmm7, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] -; SSE2-NEXT: andps %xmm5, %xmm3 -; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: andps %xmm7, %xmm1 +; SSE2-NEXT: movaps [[SIGNMASK4]](%rip), %xmm4 +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: movaps [[MAGMASK4]](%rip), %xmm5 +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 ; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: movapd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; AVX-LABEL: v4f64: ; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00] -; AVX-NEXT: vandpd %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX-NEXT: vmovapd {{.*#+}} xmm6 -; AVX-NEXT: vandpd %xmm6, %xmm5, %xmm7 -; AVX-NEXT: vorpd %xmm4, %xmm7, %xmm4 -; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX-NEXT: vandpd %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; AVX-NEXT: vandpd %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vorpd %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm4 -; AVX-NEXT: vandpd %xmm6, %xmm0, %xmm5 -; AVX-NEXT: vorpd %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vandpd %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vandps [[SIGNMASK4]](%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps [[MAGMASK4]](%rip), %ymm0, %ymm0 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; %tmp = tail call <4 x double> @llvm.copysign.v4f64( <4 x double> %a, <4 x double> %b )