Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1697,6 +1697,193 @@
   verifyIntrinsicTables();
 }
 
+typedef enum : unsigned { MMX = 0, XMM = 1, YMM = 3, ZMM = 7 } VecRegKind;
+enum : unsigned { UNDEF, FRWD, BKWD };
+
+static inline int GetLaneIndex(int Bits, int StartIdx = 0) {
+  return (((Bits + 64) >> 6) - 1) + StartIdx;
+}
+
+/// Find the smallest sub-register which accommodates all the non-undefs
+/// of a node. Lane granularity is taken as 64 bits (MMX). The lookup is
+/// performed in both forward and backward directions.
+static bool GetMinimalUsedSubReg(SmallBitVector &Lanes, int &ForwardSubReg,
+                                 VecRegKind &SubRegKind) {
+  if (Lanes.all() || Lanes.none()) {
+    ForwardSubReg = UNDEF;
+    SubRegKind = ZMM;
+    return false;
+  }
+
+  auto GetSubReg = [&](bool ScanForward) -> VecRegKind {
+    VecRegKind SubReg = ZMM;
+    VecRegKind VecSubRegs[4] = {MMX, XMM, YMM, ZMM};
+    for (int i = 0; i < 4; i++) {
+      int Checker = ScanForward ? Lanes.find_next(VecSubRegs[i])
+                                : Lanes.find_prev(ZMM - VecSubRegs[i]);
+      if (Checker == -1) {
+        SubReg = VecSubRegs[i];
+        break;
+      }
+    }
+    return SubReg;
+  };
+
+  VecRegKind FrwdSubReg = GetSubReg(true);
+  VecRegKind BkwdSubReg = GetSubReg(false);
+  if (FrwdSubReg < BkwdSubReg) {
+    ForwardSubReg = FRWD;
+    SubRegKind = FrwdSubReg;
+    return true;
+  } else if (BkwdSubReg < FrwdSubReg) {
+    ForwardSubReg = BKWD;
+    SubRegKind = BkwdSubReg;
+    return true;
+  }
+  return false;
+}
+
+// The lane granularity considered is 64 bits; mark a bit in the
+// bitvector if the corresponding lane is accessed.
+static bool MarkUsedLanes(SDNode *N, SmallBitVector &Lanes, int StartIdx) {
+  bool retVal = false;
+
+  switch (N->getOpcode()) {
+  default: {
+    int VTSz = N->getValueType(0).getSizeInBits();
+    for (int i = StartIdx, e = GetLaneIndex(VTSz, StartIdx); i < e; i++)
+      Lanes[i] = 1;
+  } break;
+  case ISD::CONCAT_VECTORS: {
+    int SZInBits = 0;
+    for (auto &Oprnd : N->op_values()) {
+      if (!Oprnd.isUndef())
+        retVal |= MarkUsedLanes(Oprnd.getNode(), Lanes, StartIdx);
+      SZInBits += Oprnd.getValueType().getSizeInBits();
+      StartIdx = GetLaneIndex(SZInBits);
+    }
+  } break;
+  case ISD::VECTOR_SHUFFLE: {
+    ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N);
+    EVT ElemTy = SV->getOperand(0).getValueType().getVectorElementType();
+    int OperNumElems = SV->getOperand(0).getValueType().getVectorNumElements();
+    int ElemSz = ElemTy.getSizeInBits();
+
+    bool OpersUndef[2] = {SV->getOperand(0).isUndef(),
+                          SV->getOperand(1).isUndef()};
+
+    ArrayRef<int> Mask = SV->getMask();
+    for (int i = 0, e = Mask.size(); i < e; i++) {
+      if (Mask[i] >= 0 && !OpersUndef[Mask[i] >= OperNumElems])
+        Lanes[GetLaneIndex(i * ElemSz, StartIdx)] = 1;
+    }
+  } break;
+  }
+
+  if (Lanes.all())
+    return true;
+
+  return retVal;
+}
+
+// A generic routine which checks whether the operands of a binary operation
+// can be scaled down to a smaller sub-register. It also sets StartIdx (the
+// element from which operand extraction needs to start), NewOperVT (the new
+// value type of the result) and PadVT (the value type of the padding for the
+// result).
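+//
+// Illustrative example (lane granularity is 64 bits): for a v16i32 (512-bit)
+// ADD whose operands have undef upper 256 bits (e.g. a CONCAT_VECTORS with an
+// undef upper half), the used-lane bitvector is 0b00001111, the minimal
+// sub-register is YMM, StartIdx becomes 0, and NewOperVT and PadVT both
+// become v8i32; the caller can then rebuild the full result as
+// CONCAT_VECTORS(v8i32 op, v8i32 undef).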
+static bool CheckDownScalingBinaryOperation(SDNode *N, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget,
+                                            uint64_t &StartIdx, EVT &NewOperVT,
+                                            EVT &PadVT) {
+  SDLoc DL(N);
+  VecRegKind Op0SubReg, Op1SubReg;
+  int Op0FrwdSubReg, Op1FrwdSubReg;
+
+  if (N->getNumOperands() != 2)
+    return false;
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (!Op0.getValueType().isVector() || !Op1.getValueType().isVector())
+    return false;
+
+  EVT OperVT = Op0.getValueType();
+  int LaneSz = GetLaneIndex(OperVT.getSizeInBits());
+
+  SmallBitVector Op0Lanes(LaneSz, 0);
+  SmallBitVector Op1Lanes(LaneSz, 0);
+
+  if (!OperVT.isSimple() || OperVT.getSizeInBits() < 64)
+    return false;
+
+  EVT OperElemVT = OperVT.getVectorElementType();
+  int OperNumElems = OperVT.getVectorNumElements();
+  int OperElemSZ = OperElemVT.getSizeInBits();
+
+  // Mark the bit corresponding to a 64-bit lane if that lane is
+  // accessed by the node.
+  bool Op0FullUse = MarkUsedLanes(Op0.getNode(), Op0Lanes, 0);
+  bool Op1FullUse = MarkUsedLanes(Op1.getNode(), Op1Lanes, 0);
+  if (Op0FullUse && Op1FullUse)
+    return false;
+
+  // Find the smallest sub-register which can accommodate the
+  // non-undef parts of the operands.
+  bool Res0 = GetMinimalUsedSubReg(Op0Lanes, Op0FrwdSubReg, Op0SubReg);
+  bool Res1 = GetMinimalUsedSubReg(Op1Lanes, Op1FrwdSubReg, Op1SubReg);
+  if (!Res0 && !Res1)
+    return false;
+
+  int OperSubReg = std::min(Op0SubReg, Op1SubReg);
+  int PadNumElems =
+      PowerOf2Floor(OperNumElems - ((OperSubReg + 1) * 64) / OperElemSZ);
+  int NewOperNumElems = OperNumElems - PadNumElems;
+
+  if ((Op0FrwdSubReg && Op1FrwdSubReg && Op0FrwdSubReg != Op1FrwdSubReg) ||
+      NewOperNumElems >= OperNumElems)
+    return false;
+
+  // The legal direction for one of the operands could be UNDEF, hence
+  // both operands' directions are OR'ed to ascertain the actual
+  // direction of the sub-register.
+  int FrwdSubReg = Op0FrwdSubReg | Op1FrwdSubReg;
+  NewOperVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, NewOperNumElems);
+  PadVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, PadNumElems);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(NewOperVT) || !TLI.isTypeLegal(PadVT))
+    return false;
+
+  StartIdx = FrwdSubReg ? 0 : OperNumElems - NewOperNumElems;
+  return true;
+}
+
+static SDValue TryDownScalingBinaryOperation(SDNode *N, SelectionDAG &DAG,
+                                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  uint64_t StartIdx;
+  EVT PadVT, NewOperVT;
+  if (CheckDownScalingBinaryOperation(N, DAG, Subtarget,
+                                      StartIdx, NewOperVT, PadVT)) {
+    SDValue ConstOffset = DAG.getIntPtrConstant(StartIdx, DL);
+    SDValue NewOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewOperVT,
+                                 N->getOperand(0), ConstOffset);
+    SDValue NewOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewOperVT,
+                                 N->getOperand(1), ConstOffset);
+    SDValue NewN = DAG.getNode(N->getOpcode(), SDLoc(N), NewOperVT,
+                               NewOp0, NewOp1);
+    SDValue ConcatOps[2] = {DAG.getUNDEF(PadVT), NewN};
+    if (StartIdx == 0) {
+      ConcatOps[0] = NewN;
+      ConcatOps[1] = DAG.getUNDEF(PadVT);
+    }
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), ConcatOps);
+  }
+  return SDValue();
+}
+
 // This has so far only been implemented for 64-bit MachO.
 bool X86TargetLowering::useLoadStackGuardNode() const {
   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
@@ -29980,6 +30167,32 @@
   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
     return Cmp;
 
+  uint64_t StartIdx;
+  EVT PadVT, NewOperVT;
+  if (isa<ConstantSDNode>(EltIdx) && InputVector.hasOneUse() &&
+      CheckDownScalingBinaryOperation(InputVector.getNode(), DAG, Subtarget,
+                                      StartIdx, NewOperVT, PadVT)) {
+    uint64_t EndIdx = StartIdx + NewOperVT.getVectorNumElements();
+    uint64_t ExtractedElt = N->getConstantOperandVal(1);
+
+    if (StartIdx <= ExtractedElt && ExtractedElt < EndIdx) {
+      SDValue NewOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewOperVT,
+                                   InputVector.getOperand(0),
+                                   DAG.getIntPtrConstant(StartIdx, dl));
+      SDValue NewOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewOperVT,
+                                   InputVector.getOperand(1),
+                                   DAG.getIntPtrConstant(StartIdx, dl));
+      SDValue NewInVec = DAG.getNode(InputVector.getOpcode(), dl,
+                                     NewOperVT, NewOp0, NewOp1);
+
+      SDValue NewEltIdx =
+          DAG.getConstant((ExtractedElt - StartIdx), dl, MVT::i32);
+
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
+                         NewInVec, NewEltIdx);
+    }
+  }
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
   if (SrcVT != MVT::v4i32)
@@ -33566,7 +33779,8 @@
 /// set to A, RHS to B, and the routine returns 'true'.
 /// Note that the binary operation should have the property that if one of the
 /// operands is UNDEF then the result is UNDEF.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative,
+                              SelectionDAG &DAG) {
   // Look for the following pattern: if
   //   A = < float a0, float a1, float a2, float a3 >
   //   B = < float b0, float b1, float b2, float b3 >
@@ -33576,6 +33790,23 @@
   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   // which is A horizontal-op B.
 
+  bool ExtractLHSSubVec = false;
+  ConstantSDNode *CSD = nullptr;
+  if (LHS.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      (CSD = cast<ConstantSDNode>(LHS.getOperand(1))) &&
+      CSD->getZExtValue() == 0) {
+    LHS = LHS.getOperand(0);
+    ExtractLHSSubVec = true;
+  }
+
+  bool ExtractRHSSubVec = false;
+  if (RHS.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      (CSD = cast<ConstantSDNode>(RHS.getOperand(1))) &&
+      CSD->getZExtValue() == 0) {
+    RHS = RHS.getOperand(0);
+    ExtractRHSSubVec = true;
+  }
+
   // At least one of the operands should be a vector shuffle.
   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
@@ -33583,8 +33814,8 @@
 
   MVT VT = LHS.getSimpleValueType();
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for horizontal add/sub");
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+         "Unsupported vector type for horizontal add/sub");
 
   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   // operate independently on 128-bit lanes.
@@ -33652,9 +33883,9 @@
   //   LHS = VECTOR_SHUFFLE A, B, LMask
   //   RHS = VECTOR_SHUFFLE A, B, RMask
   // Check that the masks correspond to performing a horizontal operation.
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+  for (unsigned L = 0; L != NumElts; L += NumLaneElts) {
     for (unsigned i = 0; i != NumLaneElts; ++i) {
-      int LIdx = LMask[i+l], RIdx = RMask[i+l];
+      int LIdx = LMask[i+L], RIdx = RMask[i+L];
 
       // Ignore any UNDEF components.
       if (LIdx < 0 || RIdx < 0 ||
@@ -33665,7 +33896,7 @@
       // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
-      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      int Index = 2*(i%HalfLaneElts) + NumElts*Src + L;
       if (!(LIdx == Index && RIdx == Index + 1) &&
           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
         return false;
@@ -33674,6 +33905,22 @@
 
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+
+  if (ExtractRHSSubVec || ExtractLHSSubVec) {
+    SDLoc DL(LHS.getNode());
+    EVT VecVT = LHS.getValueType();
+    EVT VecElemVT = VecVT.getVectorElementType();
+    unsigned NumElems = VecVT.getVectorNumElements();
+    EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), VecElemVT, NumElems / 2);
+    SDValue Const0 = DAG.getIntPtrConstant(0, DL);
+
+    if (ExtractLHSSubVec)
+      LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, LHS, Const0);
+
+    if (ExtractRHSSubVec)
+      RHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, RHS, Const0);
+  }
+
   return true;
 }
 
@@ -33686,13 +33933,18 @@
   bool IsFadd = N->getOpcode() == ISD::FADD;
   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
 
+  auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd)) {
-    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+      isHorizontalBinOp(LHS, RHS, IsFadd, DAG)) {
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
+
+  if (SDValue V = TryDownScalingBinaryOperation(N, DAG, Subtarget))
+    return V;
+
   return SDValue();
 }
 
@@ -35462,8 +35714,11 @@
   return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
 }
 
+
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+
   const SDNodeFlags Flags = N->getFlags();
   if (Flags.hasVectorReduction()) {
     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
@@ -35478,13 +35733,19 @@
 
   // Try to synthesize horizontal adds from adds of shuffles.
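+  // Note: isHorizontalBinOp can now also peel a zero-index EXTRACT_SUBVECTOR
+  // off either operand before matching the shuffle masks, so operands that
+  // were narrowed (for example by TryDownScalingBinaryOperation above) may
+  // still be matched to a horizontal add.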
if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && - isHorizontalBinOp(Op0, Op1, true)) + isHorizontalBinOp(Op0, Op1, true, DAG)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); if (SDValue V = combineIncDecVector(N, DAG)) return V; - return combineAddOrSubToADCOrSBB(N, DAG); + if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + return V; + + if (SDValue V = TryDownScalingBinaryOperation(N, DAG, Subtarget)) + return V; + + return SDValue(); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -35514,13 +35775,19 @@ EVT VT = N->getValueType(0); if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && - isHorizontalBinOp(Op0, Op1, false)) + isHorizontalBinOp(Op0, Op1, false, DAG)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); if (SDValue V = combineIncDecVector(N, DAG)) return V; - return combineAddOrSubToADCOrSBB(N, DAG); + if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + return V; + + if (SDValue V = TryDownScalingBinaryOperation(N, DAG, Subtarget)) + return V; + + return SDValue(); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, Index: test/CodeGen/X86/avx512-hadd-hsub.ll =================================================================== --- test/CodeGen/X86/avx512-hadd-hsub.ll +++ test/CodeGen/X86/avx512-hadd-hsub.ll @@ -6,18 +6,16 @@ ; KNL-LABEL: hadd_16: ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16: ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -33,18 +31,16 @@ ; KNL-LABEL: hsub_16: ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: hsub_16: ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -60,19 +56,17 @@ ; KNL-LABEL: fhadd_16: ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16: ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: 
vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> @@ -87,19 +81,17 @@ ; KNL-LABEL: fhsub_16: ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhsub_16: ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; SKX-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> @@ -117,16 +109,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: hadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> @@ -144,16 +132,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: fhadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> @@ -174,16 +158,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: fhadd_16_4: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_4: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -46,10 +46,10 @@ ; AVX2-NEXT: jne .LBB0_1 ; 
AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -70,10 +70,10 @@ ; AVX512-NEXT: jne .LBB0_1 ; AVX512-NEXT: # BB#2: # %middle.block ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -158,10 +158,10 @@ ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -183,10 +183,10 @@ ; AVX512-NEXT: jne .LBB1_1 ; AVX512-NEXT: # BB#2: # %middle.block ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -293,9 +293,9 @@ ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -318,13 +318,12 @@ ; AVX512-NEXT: jne .LBB2_1 ; AVX512-NEXT: # BB#2: # %middle.block ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -50,9 +50,9 @@ ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper @@ -73,13 +73,12 @@ ; AVX512F-NEXT: jne .LBB0_1 ; AVX512F-NEXT: # BB#2: # %middle.block ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -99,13 +98,12 @@ ; AVX512BW-NEXT: jne .LBB0_1 ; AVX512BW-NEXT: # BB#2: # %middle.block ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -297,10 +295,10 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -322,13 +320,12 @@ ; AVX512F-NEXT: # BB#2: # %middle.block ; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -350,13 +347,12 @@ ; AVX512BW-NEXT: # BB#2: # %middle.block ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -750,10 +746,10 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -795,13 +791,12 @@ ; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -824,13 +819,12 @@ ; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1253,9 +1247,9 @@ ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1265,9 +1259,9 @@ ; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1277,9 +1271,9 @@ ; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512BW-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: 
test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-all_of.ll +++ test/CodeGen/X86/vector-compare-all_of.ll @@ -68,7 +68,7 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -210,7 +210,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -358,7 +358,7 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -531,7 +531,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -690,7 +690,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: %AX %AX %EAX ; AVX1-NEXT: vzeroupper @@ -718,7 +718,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX ; AVX512-NEXT: vzeroupper @@ -894,7 +894,7 @@ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: %AL %AL %EAX ; AVX1-NEXT: vzeroupper @@ -924,7 +924,7 @@ ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: %AL %AL %EAX ; AVX512-NEXT: vzeroupper Index: test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-any_of.ll +++ test/CodeGen/X86/vector-compare-any_of.ll @@ -66,7 +66,7 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -196,7 +196,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, 
%xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -336,7 +336,7 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -493,7 +493,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -640,7 +640,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: %AX %AX %EAX ; AVX1-NEXT: vzeroupper @@ -667,7 +667,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX ; AVX512-NEXT: vzeroupper @@ -831,7 +831,7 @@ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: %AL %AL %EAX ; AVX1-NEXT: vzeroupper @@ -860,7 +860,7 @@ ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: %AL %AL %EAX ; AVX512-NEXT: vzeroupper