Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31710,6 +31710,70 @@
   return SDValue();
 }
 
+// Operations that we can recognize as part of a horizontal reduction.
+static bool isHorizontalReductionOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case ISD::ADD:
+  case ISD::UMAX: case ISD::UMIN:
+  case ISD::SMAX: case ISD::SMIN:
+  case ISD::FADD:
+  case X86ISD::FMAX: case X86ISD::FMIN:
+  case X86ISD::FMAXC: case X86ISD::FMINC:
+    return true;
+  }
+
+  return false;
+}
+
+// If we are looking at a step of a horizontal reduction, return the number of
+// meaningful elements it produces (the shuffle's shift amount), or -1 if not.
+static int isHorizontalReductionStep(SDValue Root) {
+  unsigned Opcode = Root.getOpcode();
+
+  if (!isHorizontalReductionOpcode(Opcode))
+    return -1;
+
+  SDValue LHS = Root.getOperand(0);
+  SDValue RHS = Root.getOperand(1);
+
+  // Canonicalize the op to LHS.
+  if (RHS.getOpcode() == Opcode)
+    std::swap(LHS, RHS);
+
+  if (LHS.getOpcode() != Opcode ||
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return -1;
+
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
+
+  // Make sure the shuffle has one input and it's the same as LHS.
+  if (RHS.getOperand(0) != LHS)
+    return -1;
+
+  // The mask should be shifting over elements. Find the starting index.
+  int StartIdx = Mask[0];
+  // Undef or a shift of 0 isn't interesting.
+  if (StartIdx <= 0)
+    return -1;
+
+  // Make sure the start index is no larger than half the mask or we'll go
+  // out of bounds.
+  if (StartIdx * 2 > (int)Mask.size())
+    return -1;
+
+  // Make sure the first StartIdx elements are contiguous.
+  for (int i = 0; i != StartIdx; ++i)
+    if (Mask[i] != StartIdx + i)
+      return -1;
+
+  // Make sure the rest are undef.
+  for (int i = StartIdx; i != (int)Mask.size(); ++i)
+    if (Mask[i] >= 0)
+      return -1;
+
+  return StartIdx;
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
 /// For i686, the best sequence is apparently storing the value and loading
@@ -31781,6 +31845,21 @@
   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
     return MinMax;
 
+  // If this is an extract from element 0 of a 256/512 bit vector that is
+  // the root of a horizontal reduction, insert an extract_subvector to kick
+  // off a narrowing process.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.isTypeLegal(SrcVT) && isa<ConstantSDNode>(EltIdx) &&
+      cast<ConstantSDNode>(EltIdx)->getZExtValue() == 0 &&
+      InputVector.hasOneUse() && SrcVT.getSizeInBits() > 128 &&
+      isHorizontalReductionStep(InputVector) == 1) {
+    MVT EltVT = SrcVT.getSimpleVT().getVectorElementType();
+    MVT SubVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
+                                 InputVector, DAG.getIntPtrConstant(0, dl));
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, SubVec, EltIdx);
+  }
+
   return SDValue();
 }
 
@@ -38481,15 +38560,87 @@
   return SDValue();
 }
 
+// Narrows a step in a horizontal reduction.
+// For example:
+// extract_subvector (binop X, (vector_shuffle X, (4, 5, 6, 7, -1, -1, -1, -1)))
+// where X is another binop with the same opcode.
+// We can put an extract_subvector on X and narrow this binop and the shuffle
+// based on the elements not being needed. This can allow us to start with a
+// 512 bit binop, and narrow to 256 and 128 bits as the reduction gets
+// smaller. The 128/256 bit operations can potentially use a smaller VEX
+// encoding.
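+//
+// A concrete sketch (node names and element counts are illustrative): for a
+// v16i32 add reduction, the step
+//   t3: v16i32 = add t1, (vector_shuffle<4,5,6,7,u,...> t1, undef)
+//   t4: v4i32 = extract_subvector t3, 0
+// only needs the low 8 elements of t1, so, assuming t1 is itself an add (the
+// previous reduction step), it can be rebuilt at 256 bits as
+//   s1: v8i32 = extract_subvector t1, 0
+//   s3: v8i32 = add s1, (vector_shuffle<4,5,6,7,u,u,u,u> s1, s1)
+//   t4: v4i32 = extract_subvector s3, 0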
+static SDValue narrowHorizontalReductionStep(MVT VT, SDValue InVec,
+                                             SelectionDAG &DAG) {
+  int ReductionSize = isHorizontalReductionStep(InVec);
+  if (ReductionSize < 0)
+    return SDValue();
+
+  unsigned RequiredElts = ReductionSize * 2;
+  RequiredElts = std::max(RequiredElts, VT.getVectorNumElements());
+
+  unsigned NewWidth = RequiredElts * VT.getScalarSizeInBits();
+  NewWidth = alignTo(NewWidth, 128);
+
+  unsigned OpWidth = InVec.getSimpleValueType().getSizeInBits();
+  if (NewWidth >= OpWidth || OpWidth % NewWidth != 0)
+    return SDValue();
+
+  unsigned NewNumElts = NewWidth / VT.getScalarSizeInBits();
+  MVT NewVT = MVT::getVectorVT(VT.getVectorElementType(), NewNumElts);
+
+  unsigned Opcode = InVec.getOpcode();
+
+  SDValue LHS = InVec.getOperand(0);
+  SDValue RHS = InVec.getOperand(1);
+
+  // Canonicalize the op to LHS.
+  if (RHS.getOpcode() == Opcode)
+    std::swap(LHS, RHS);
+
+  if (LHS.getOpcode() != Opcode ||
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
+
+  SDLoc dl(InVec);
+  LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, LHS,
+                    DAG.getIntPtrConstant(0, dl));
+  RHS = DAG.getVectorShuffle(NewVT, dl, LHS, LHS, Mask.slice(0, NewNumElts));
+  SDValue Op = DAG.getNode(Opcode, dl, NewVT, LHS, RHS);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
-  if (DCI.isBeforeLegalizeOps())
+  EVT VT = N->getValueType(0);
+  SDValue InVec = N->getOperand(0);
+
+  // Only handle constant indices.
+  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Idx)
     return SDValue();
 
-  MVT OpVT = N->getSimpleValueType(0);
-  SDValue InVec = N->getOperand(0);
-  unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  unsigned IdxVal = Idx->getZExtValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (IdxVal == 0 && TLI.isTypeLegal(VT) &&
+      TLI.isTypeLegal(InVec.getValueType()) && InVec.hasOneUse()) {
+    if (SDValue V = narrowHorizontalReductionStep(VT.getSimpleVT(), InVec, DAG))
+      return V;
+  }
+
+  // TODO: This code isn't needed anymore. Generic DAG combine does this better.
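+
+  // Illustrative note (assuming every step matches the pattern above): for a
+  // v16i32 add reduction, repeated combines leave the first step at 512 bits,
+  // rebuild the second step at 256 bits, and the remaining steps at 128 bits.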
+ + if (!VT.isSimple()) + return SDValue(); + + MVT OpVT = VT.getSimpleVT(); + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); Index: test/CodeGen/X86/avx512-hadd-hsub.ll =================================================================== --- test/CodeGen/X86/avx512-hadd-hsub.ll +++ test/CodeGen/X86/avx512-hadd-hsub.ll @@ -7,8 +7,7 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; @@ -16,8 +15,7 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -61,18 +59,14 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16: ; SKX: # %bb.0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; SKX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> Index: test/CodeGen/X86/horizontal-reduce-smax.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-smax.ll +++ test/CodeGen/X86/horizontal-reduce-smax.ll @@ -469,10 +469,7 @@ ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -484,8 +481,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -548,10 +545,7 @@ ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: 
vzeroupper ; X64-AVX1-NEXT: retq @@ -562,8 +556,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -573,7 +567,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -639,7 +633,7 @@ ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -695,7 +689,7 @@ ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -707,7 +701,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1283,10 +1277,7 @@ ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -1299,8 +1290,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1312,7 +1303,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1395,9 +1386,9 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; 
X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1467,9 +1458,9 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1483,7 +1474,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq Index: test/CodeGen/X86/horizontal-reduce-smin.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-smin.ll +++ test/CodeGen/X86/horizontal-reduce-smin.ll @@ -472,10 +472,7 @@ ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -487,8 +484,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -552,10 +549,7 @@ ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -566,8 +560,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -577,7 +571,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; 
X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -643,7 +637,7 @@ ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -699,7 +693,7 @@ ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -711,7 +705,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1287,10 +1281,7 @@ ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -1303,8 +1294,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1316,7 +1307,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1399,9 +1390,9 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1471,9 +1462,9 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ 
-1487,7 +1478,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq Index: test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-umax.ll +++ test/CodeGen/X86/horizontal-reduce-umax.ll @@ -533,15 +533,11 @@ ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -552,14 +548,15 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 -; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -629,15 +626,11 @@ ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; 
X64-AVX1-NEXT: retq @@ -647,14 +640,15 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -664,7 +658,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -739,7 +733,7 @@ ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -804,7 +798,7 @@ ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -816,7 +810,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1420,15 +1414,11 @@ ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -1442,14 +1432,15 @@ ; X64-AVX2-NEXT: vblendvpd 
%ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 ; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 ; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1461,7 +1452,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1559,9 +1550,9 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1646,9 +1637,9 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1662,7 +1653,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq Index: test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- test/CodeGen/X86/horizontal-reduce-umin.ll +++ test/CodeGen/X86/horizontal-reduce-umin.ll @@ -471,15 +471,11 @@ ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpermilps 
{{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -490,14 +486,15 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -569,15 +566,11 @@ ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -587,14 +580,15 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -604,7 +598,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -679,7 +673,7 @@ ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -744,7 +738,7 @@ ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -756,7 +750,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1324,15 +1318,11 @@ ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -1346,14 +1336,15 @@ ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 ; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1365,7 +1356,7 @@ ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax ; X64-AVX512-NEXT: vzeroupper 
; X64-AVX512-NEXT: retq @@ -1463,9 +1454,9 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1550,9 +1541,9 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1566,7 +1557,7 @@ ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -51,8 +51,7 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -75,8 +74,8 @@ ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -188,8 +187,7 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -238,11 +236,10 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -338,8 +335,7 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; 
AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -363,8 +359,8 @@ ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -486,8 +482,7 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -542,11 +537,10 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -666,8 +660,7 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -718,11 +711,10 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -894,8 +886,7 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -968,11 +959,10 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -998,11 +988,10 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: 
vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/required-vector-width.ll =================================================================== --- test/CodeGen/X86/required-vector-width.ll +++ test/CodeGen/X86/required-vector-width.ll @@ -261,11 +261,10 @@ ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -386,11 +385,10 @@ ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -56,8 +56,7 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -102,11 +101,10 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -318,8 +316,7 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -368,11 +365,10 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; 
AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -860,8 +856,7 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper @@ -982,11 +977,10 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1011,11 +1005,10 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq