Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1700,6 +1700,297 @@
   verifyIntrinsicTables();
 }
 
+typedef enum : unsigned { MMX = 0, XMM = 1, YMM = 3, ZMM = 7 } VecRegKind;
+enum : unsigned { UNDEF, FRWD, BKWD };
+
+static inline int GetLaneIndex(int Bits, int StartIdx = 0) {
+  return (((Bits + 64) >> 6) - 1) + StartIdx;
+}
+
+/// Find the smallest sub-register which accommodates all the non-undef
+/// lanes of a node. Lane granularity is 64 bits (MMX). The lookup is
+/// performed in both forward and backward directions.
+static bool GetMinimalUsedSubReg(SmallBitVector &Lanes, int &ForwardSubReg,
+                                 VecRegKind &SubRegKind) {
+  if (Lanes.all() || Lanes.none()) {
+    ForwardSubReg = UNDEF;
+    SubRegKind = ZMM;
+    return false;
+  }
+
+  auto GetSubReg = [&](bool ScanForward) -> VecRegKind {
+    VecRegKind SubReg = ZMM;
+    VecRegKind VecSubRegs[4] = {MMX, XMM, YMM, ZMM};
+    for (int i = 0; i < 4; i++) {
+      int Checker = ScanForward ? Lanes.find_next(VecSubRegs[i])
+                                : Lanes.find_prev(ZMM - VecSubRegs[i]);
+      if (Checker == -1) {
+        SubReg = VecSubRegs[i];
+        break;
+      }
+    }
+    return SubReg;
+  };
+
+  VecRegKind FrwdSubReg = GetSubReg(true);
+  VecRegKind BkwdSubReg = GetSubReg(false);
+  if (FrwdSubReg < BkwdSubReg) {
+    ForwardSubReg = FRWD;
+    SubRegKind = FrwdSubReg;
+    return true;
+  } else if (BkwdSubReg < FrwdSubReg) {
+    ForwardSubReg = BKWD;
+    SubRegKind = BkwdSubReg;
+    return true;
+  }
+  return false;
+}
+
+// Lane granularity is 64 bits; mark a bit in the bitvector if the
+// corresponding lane is accessed.
+static bool MarkLanesUsedByOperands(SDNode *N, SmallBitVector &Lanes,
+                                    int StartIdx) {
+  bool retVal = false;
+
+  switch (N->getOpcode()) {
+  default: {
+    EVT VT = N->getValueType(0);
+    if (VT.isSimple() && VT.isVector()) {
+      int VTSz = VT.getSizeInBits();
+      for (int i = StartIdx, e = GetLaneIndex(VTSz, StartIdx); i < e; i++)
+        Lanes[i] = 1;
+    } else {
+      // Mark all the lanes used in the default case.
+      Lanes.set(0, Lanes.size());
+    }
+  } break;
+  case ISD::CONCAT_VECTORS: {
+    int SZInBits = 0;
+    for (auto &Oprnd : N->op_values()) {
+      if (!Oprnd.isUndef())
+        retVal |= MarkLanesUsedByOperands(Oprnd.getNode(), Lanes, StartIdx);
+      SZInBits += Oprnd.getValueType().getSizeInBits();
+      StartIdx = GetLaneIndex(SZInBits);
+    }
+  } break;
+  case ISD::VECTOR_SHUFFLE: {
+    ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N);
+    EVT ElemTy = SV->getOperand(0).getValueType().getVectorElementType();
+    int OperNumElems = SV->getOperand(0).getValueType().getVectorNumElements();
+    int ElemSz = ElemTy.getSizeInBits();
+
+    bool OpersUndef[2] = {SV->getOperand(0).isUndef(),
+                          SV->getOperand(1).isUndef()};
+
+    ArrayRef<int> Mask = SV->getMask();
+    for (int i = 0, e = Mask.size(); i < e; i++) {
+      if (Mask[i] >= 0 && !OpersUndef[Mask[i] >= OperNumElems])
+        Lanes[GetLaneIndex(i * ElemSz, StartIdx)] = 1;
+    }
+  } break;
+  }
+
+  if (Lanes.all())
+    return true;
+
+  return retVal;
+}
+
+// A generic routine which checks whether the operands of a binary operation
+// can be scaled down to a smaller sub-register. It also sets StartIdx (the
+// element index at which operand extraction starts), NewOperVT (the value
+// type of the narrowed result) and PadVT (the value type of the padding for
+// the result).
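+//
+// Illustrative example: for a v16i32 ADD whose shuffle operands leave the
+// upper 256 bits undef, the add can be performed on the low v8i32 halves and
+// the v16i32 result recreated by concatenating the narrow result with UNDEF
+// padding.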
+static bool CheckDownScalingBinOpByOperands(SDNode *N, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget,
+                                            uint64_t &StartIdx, EVT &NewOperVT,
+                                            EVT &PadVT) {
+  SDLoc DL(N);
+  VecRegKind Op0SubReg, Op1SubReg;
+  int Op0FrwdSubReg, Op1FrwdSubReg;
+
+  if (N->getNumOperands() != 2)
+    return false;
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (!Op0.getValueType().isVector() || !Op1.getValueType().isVector())
+    return false;
+
+  EVT OperVT = Op0.getValueType();
+  int LaneSz = GetLaneIndex(OperVT.getSizeInBits());
+
+  SmallBitVector Op0Lanes(LaneSz, 0);
+  SmallBitVector Op1Lanes(LaneSz, 0);
+
+  if (!OperVT.isSimple() || OperVT.getSizeInBits() < 64)
+    return false;
+
+  EVT OperElemVT = OperVT.getVectorElementType();
+  int OperNumElems = OperVT.getVectorNumElements();
+  int OperElemSZ = OperElemVT.getSizeInBits();
+
+  // Mark the bit corresponding to each 64-bit lane that is
+  // accessed by the node.
+  bool Op0FullUse = MarkLanesUsedByOperands(Op0.getNode(), Op0Lanes, 0);
+  bool Op1FullUse = MarkLanesUsedByOperands(Op1.getNode(), Op1Lanes, 0);
+  if (Op0FullUse && Op1FullUse)
+    return false;
+
+  // Find the smallest sub-register which can accommodate the
+  // non-undef part of the operands.
+  bool Res0 = GetMinimalUsedSubReg(Op0Lanes, Op0FrwdSubReg, Op0SubReg);
+  bool Res1 = GetMinimalUsedSubReg(Op1Lanes, Op1FrwdSubReg, Op1SubReg);
+  if (!Res0 && !Res1)
+    return false;
+
+  int OperSubReg = std::min(Op0SubReg, Op1SubReg);
+  int PadNumElems =
+      PowerOf2Floor(OperNumElems - ((OperSubReg + 1) * 64) / OperElemSZ);
+  int NewOperNumElems = OperNumElems - PadNumElems;
+
+  if ((Op0FrwdSubReg && Op1FrwdSubReg && Op0FrwdSubReg != Op1FrwdSubReg) ||
+      NewOperNumElems >= OperNumElems)
+    return false;
+
+  // The legal direction for one of the operands could be UNDEF,
+  // hence both operands' directions are OR'ed to determine the
+  // actual direction of the sub-register.
+  int FrwdSubReg = Op0FrwdSubReg | Op1FrwdSubReg;
+  NewOperVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, NewOperNumElems);
+  PadVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, PadNumElems);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(NewOperVT) || !TLI.isTypeLegal(PadVT))
+    return false;
+
+  StartIdx = FrwdSubReg == FRWD ? 0 : PadNumElems;
+  return true;
+}
+
+// Mark the 64-bit lanes of a vector value that are read by its user node N.
+// Returns false if narrowing is not possible (e.g. all lanes are used).
+static bool MarkLanesUsedByUsers(SDNode *N, SmallBitVector &Lanes) {
+  unsigned ElemSz = 0;
+  int64_t SubVecSize = 1;
+
+  switch (N->getOpcode()) {
+  default: {
+    EVT VT = N->getValueType(0);
+    if (VT.isSimple() && VT.isVector()) {
+      int VTSz = VT.getSizeInBits();
+      for (int i = 0, e = GetLaneIndex(VTSz); i < e; i++)
+        Lanes[i] = 1;
+    } else {
+      // Mark all the lanes used in the default case.
+      Lanes.set(0, Lanes.size());
+    }
+  } break;
+
+  case ISD::EXTRACT_SUBVECTOR: {
+    SubVecSize = N->getValueType(0).getVectorNumElements();
+    ElemSz = N->getValueType(0).getVectorElementType().getSizeInBits();
+    // Deliberately falls through to EXTRACT_VECTOR_ELT to share the index
+    // handling.
+  }
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Idx = N->getOperand(1);
+    ElemSz = ElemSz ? ElemSz : N->getValueType(0).getSizeInBits();
+    if (!isa<ConstantSDNode>(Idx))
+      return false;
+
+    int64_t StartIdx =
+        (dyn_cast<ConstantSDNode>(Idx.getNode()))->getSExtValue();
+    int64_t EndIdx = StartIdx + SubVecSize - 1;
+    for (int i = StartIdx, e = EndIdx; i <= e; i++)
+      Lanes[GetLaneIndex(i * ElemSz)] = 1;
+  } break;
+
+  case ISD::VECTOR_SHUFFLE: {
+    ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N);
+    EVT ElemTy = SV->getOperand(0).getValueType().getVectorElementType();
+    int OperNumElems = SV->getOperand(0).getValueType().getVectorNumElements();
+    int ElemSz = ElemTy.getSizeInBits();
+    ArrayRef<int> Mask = SV->getMask();
+
+    bool FirstOprd = SV->getOperand(0).getNode() == N;
+    bool UsedOprd[2] = {FirstOprd, !FirstOprd};
+
+    // Mark the lane bits of node N which are used by
+    // the shuffle vector.
+    for (int i = 0, e = Mask.size(); i < e; i++)
+      if (Mask[i] >= 0 && UsedOprd[Mask[i] >= OperNumElems])
+        Lanes[GetLaneIndex(i * ElemSz)] = 1;
+  } break;
+  }
+
+  if (Lanes.all())
+    return false;
+
+  return true;
+}
+
+// Check whether the result of N can be scaled down to a smaller sub-register
+// based on how its users access it. Sets StartIdx, NewVT and PadVT analogously
+// to CheckDownScalingBinOpByOperands.
+static bool CheckDownScalingBinOpByUses(SDNode *N, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget,
+                                        uint64_t &StartIdx, EVT &NewVT,
+                                        EVT &PadVT) {
+  int FrwdSubReg;
+  VecRegKind SubReg;
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isSimple() || VT.getSizeInBits() < 64)
+    return false;
+
+  int LaneSz = GetLaneIndex(VT.getSizeInBits());
+  SmallBitVector Lanes(LaneSz, 0);
+
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+       ++UI)
+    if (!MarkLanesUsedByUsers(*UI, Lanes))
+      return false;
+
+  if (!GetMinimalUsedSubReg(Lanes, FrwdSubReg, SubReg))
+    return false;
+
+  EVT ElemVT = VT.getVectorElementType();
+  int ElemSZ = ElemVT.getSizeInBits();
+  int NumElems = VT.getVectorNumElements();
+
+  int PadNumElems = PowerOf2Floor(NumElems - ((SubReg + 1) * 64) / ElemSZ);
+  int NewNumElems = NumElems - PadNumElems;
+
+  if (NewNumElems >= NumElems)
+    return false;
+
+  NewVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NewNumElems);
+  PadVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, PadNumElems);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(NewVT) || !TLI.isTypeLegal(PadVT))
+    return false;
+
+  StartIdx = FrwdSubReg == FRWD ? 0 : PadNumElems;
+  return true;
+}
+
+// Build the narrowed binary operation: extract the used sub-vectors, perform
+// the operation in the smaller type OperVT, and pad the unused part of the
+// result with UNDEF up to VT.
+static SDValue CreateScaledDownBinOper(SDLoc &DL, EVT VT, SDValue Op0,
+                                       SDValue Op1, SelectionDAG &DAG,
+                                       unsigned OpCode, uint64_t StartIdx,
+                                       EVT OperVT, EVT PadVT,
+                                       const X86Subtarget &Subtarget) {
+  SDValue ConstOffset = DAG.getIntPtrConstant(StartIdx, DL);
+  SDValue NewOp0 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OperVT, Op0, ConstOffset);
+
+  SDValue NewOp1 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OperVT, Op1, ConstOffset);
+
+  SDValue NewN = DAG.getNode(OpCode, DL, OperVT, NewOp0, NewOp1);
+  SDValue ConcatOps[2] = {DAG.getUNDEF(PadVT), NewN};
+  if (StartIdx == 0) {
+    ConcatOps[0] = NewN;
+    ConcatOps[1] = DAG.getUNDEF(PadVT);
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
+}
+
 // This has so far only been implemented for 64-bit MachO.
 bool X86TargetLowering::useLoadStackGuardNode() const {
   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
@@ -33529,7 +33820,8 @@
 /// set to A, RHS to B, and the routine returns 'true'.
 /// Note that the binary operation should have the property that if one of the
 /// operands is UNDEF then the result is UNDEF.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative,
+                              SelectionDAG &DAG) {
   // Look for the following pattern: if
   //   A = < float a0, float a1, float a2, float a3 >
   //   B = < float b0, float b1, float b2, float b3 >
@@ -33544,10 +33836,12 @@
       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
     return false;
 
-  MVT VT = LHS.getSimpleValueType();
+  if (!LHS.getValueType().isSimple() || !RHS.getValueType().isSimple())
+    return false;
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for horizontal add/sub");
+  MVT VT = LHS.getSimpleValueType();
+  if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
+    return false;
 
   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   // operate independently on 128-bit lanes.
@@ -33615,9 +33909,9 @@
   //   LHS = VECTOR_SHUFFLE A, B, LMask
   //   RHS = VECTOR_SHUFFLE A, B, RMask
   // Check that the masks correspond to performing a horizontal operation.
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+  for (unsigned L = 0; L != NumElts; L += NumLaneElts) {
     for (unsigned i = 0; i != NumLaneElts; ++i) {
-      int LIdx = LMask[i+l], RIdx = RMask[i+l];
+      int LIdx = LMask[i+L], RIdx = RMask[i+L];
 
       // Ignore any UNDEF components.
       if (LIdx < 0 || RIdx < 0 ||
@@ -33628,7 +33922,7 @@
       // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
-      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      int Index = 2*(i%HalfLaneElts) + NumElts*Src + L;
       if (!(LIdx == Index && RIdx == Index + 1) &&
           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
         return false;
@@ -33637,25 +33931,72 @@
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
   return true;
 }
 
+// Combine an ADD/SUB of shuffles in the DAG into a horizontal operation.
+static SDValue combineToHorizontalOperation(SDNode *N, bool IsIntegralOp,
+                                            bool IsCommutative, unsigned OpCode,
+                                            const X86Subtarget &Subtarget,
+                                            SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  auto ValidHorizontalTypes = [&](EVT ValTyp) -> bool {
+    if (IsIntegralOp)
+      return ((Subtarget.hasSSSE3() &&
+               (ValTyp == MVT::v8i16 || ValTyp == MVT::v4i32)) ||
+              (Subtarget.hasInt256() &&
+               (ValTyp == MVT::v16i16 || ValTyp == MVT::v8i32)));
+    else
+      return ((Subtarget.hasSSE3() &&
+               (ValTyp == MVT::v4f32 || ValTyp == MVT::v2f64)) ||
+              (Subtarget.hasFp256() &&
+               (ValTyp == MVT::v8f32 || ValTyp == MVT::v4f64)));
+  };
+
+  // Try to synthesize horizontal [f]add/[f]sub from adds/subs of shuffles.
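+  // For 512-bit vectors, for which no horizontal instruction exists, the
+  // checks below try to shrink the node to a narrower (XMM/YMM) horizontal op
+  // whenever the operands or the users of the result only touch a leading or
+  // trailing sub-register, padding the unused part of the result with UNDEF
+  // (see the down-scaling helpers above).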
+ bool ValidHorizontalPattern = isHorizontalBinOp(Op0, Op1, IsCommutative, DAG); + + if (ValidHorizontalPattern && ValidHorizontalTypes(VT)) + return DAG.getNode(OpCode, DL, VT, Op0, Op1); + + if (ValidHorizontalPattern && VT.is512BitVector()) { + uint64_t StartIdx; + EVT NewOperVT, PadVT; + + if (CheckDownScalingBinOpByUses(N, DAG, Subtarget, StartIdx, NewOperVT, + PadVT) && + ValidHorizontalTypes(NewOperVT)) + return CreateScaledDownBinOper(DL, VT, Op0, Op1, DAG, OpCode, StartIdx, + NewOperVT, PadVT, Subtarget); + else if (CheckDownScalingBinOpByOperands(N, DAG, Subtarget, StartIdx, + NewOperVT, PadVT) && + ValidHorizontalTypes(NewOperVT)) + return CreateScaledDownBinOper(DL, VT, Op0, Op1, DAG, OpCode, StartIdx, + NewOperVT, PadVT, Subtarget); + else { + // Not creating multiple Horizontal add/sub due to latency considerations. + } + } + + return SDValue(); +} + /// Do target-specific dag combines on floating-point adds/subs. static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); bool IsFadd = N->getOpcode() == ISD::FADD; assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); - // Try to synthesize horizontal add/sub from adds/subs of shuffles. - if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, IsFadd)) { - auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; - return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); - } + auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; + if (SDValue V = combineToHorizontalOperation(N, false, IsFadd, NewOpcode, + Subtarget, DAG)) + return V; + return SDValue(); } @@ -35564,6 +35905,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + SDLoc DL(N); + const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) @@ -35571,20 +35914,18 @@ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) return MAdd; } - EVT VT = N->getValueType(0); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - // Try to synthesize horizontal adds from adds of shuffles. - if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && - isHorizontalBinOp(Op0, Op1, true)) - return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineToHorizontalOperation(N, true, true, X86ISD::HADD, + Subtarget, DAG)) + return V; if (SDValue V = combineIncDecVector(N, DAG)) return V; - return combineAddOrSubToADCOrSBB(N, DAG); + if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + return V; + + return SDValue(); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -35610,17 +35951,17 @@ } } - // Try to synthesize horizontal subs from subs of shuffles. 
- EVT VT = N->getValueType(0); - if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || - (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && - isHorizontalBinOp(Op0, Op1, false)) - return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + if (SDValue V = combineToHorizontalOperation(N, true, false, X86ISD::HSUB, + Subtarget, DAG)) + return V; if (SDValue V = combineIncDecVector(N, DAG)) return V; - return combineAddOrSubToADCOrSBB(N, DAG); + if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + return V; + + return SDValue(); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, Index: test/CodeGen/X86/avx512-hadd-hsub.ll =================================================================== --- test/CodeGen/X86/avx512-hadd-hsub.ll +++ test/CodeGen/X86/avx512-hadd-hsub.ll @@ -7,8 +7,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; @@ -16,8 +15,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -34,8 +32,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; @@ -43,8 +40,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -61,8 +57,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vhaddps %ymm0, %ymm0, %ymm0 ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -70,8 +65,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 ; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -88,8 +82,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vhsubps %ymm0, %ymm0, %ymm0 ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -97,8 +90,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 ; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -113,16 +105,12 @@ define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) { ; KNL-LABEL: hadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = 
ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> @@ -136,16 +124,12 @@ define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) { ; KNL-LABEL: fhadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> @@ -158,16 +142,12 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fhadd_16_4: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_4: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> @@ -178,18 +158,12 @@ define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_low: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_low: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> @@ -201,18 +175,16 @@ define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_high: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vaddpd %zmm0, %zmm2, 
%zmm0 +; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_high: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vextractf64x4 $1, %zmm1, %ymm1 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> @@ -225,18 +197,12 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) { ; KNL-LABEL: hadd_16_3_sv: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} zmm2 = zmm0[0,2],zmm1[0,2],zmm0[4,6],zmm1[4,6],zmm0[8,10],zmm1[8,10],zmm0[12,14],zmm1[12,14] -; KNL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,3],zmm1[1,3],zmm0[5,7],zmm1[5,7],zmm0[9,11],zmm1[9,11],zmm0[13,15],zmm1[13,15] -; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16_3_sv: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} zmm2 = zmm0[0,2],zmm1[0,2],zmm0[4,6],zmm1[4,6],zmm0[8,10],zmm1[8,10],zmm0[12,14],zmm1[12,14] -; SKX-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,3],zmm1[1,3],zmm0[5,7],zmm1[5,7],zmm0[9,11],zmm1[9,11],zmm0[13,15],zmm1[13,15] -; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; SKX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> @@ -252,17 +218,13 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_eel: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_eel: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -278,18 +240,20 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fsub_noundef_ee: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; KNL-NEXT: retq ; ; SKX-LABEL: fsub_noundef_ee: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = 
zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; SKX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; SKX-NEXT: vzeroupper Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -323,8 +323,7 @@ ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -78,8 +78,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -104,8 +103,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -327,8 +325,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -355,8 +352,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -800,8 +796,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -829,8 +824,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq