Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1691,6 +1691,163 @@
   verifyIntrinsicTables();
 }
 
+typedef enum : unsigned { MMX = 0, XMM = 1, YMM = 3, ZMM = 7 } VecRegKind;
+enum : unsigned { UNDEF, FRWD, BKWD };
+
+static inline int GetLaneIndex(int Bits, int StartIdx = 0) {
+  return (((Bits + 64) >> 6) - 1) + StartIdx;
+}
+
+/// Find the smallest sub-register which accommodates all the non-undef
+/// lanes of a node. Lane granularity is 64 bits (MMX); the lookup is
+/// performed in both the forward and the backward direction.
+static bool GetMinimalUsedSubReg(SmallBitVector &Lanes, int &ForwardSubReg,
+                                 VecRegKind &SubRegKind) {
+  if (Lanes.all() || Lanes.none()) {
+    ForwardSubReg = UNDEF;
+    SubRegKind = ZMM;
+    return false;
+  }
+
+  auto GetSubReg = [&](bool ScanForward) -> VecRegKind {
+    VecRegKind SubReg = ZMM;
+    VecRegKind VecSubRegs[4] = {MMX, XMM, YMM, ZMM};
+    for (int i = 0; i < 4; i++) {
+      int Checker = ScanForward ? Lanes.find_next(VecSubRegs[i])
+                                : Lanes.find_prev(ZMM - VecSubRegs[i]);
+      if (Checker == -1) {
+        SubReg = VecSubRegs[i];
+        break;
+      }
+    }
+    return SubReg;
+  };
+
+  VecRegKind FrwdSubReg = GetSubReg(true);
+  VecRegKind BkwdSubReg = GetSubReg(false);
+  if (FrwdSubReg < BkwdSubReg) {
+    ForwardSubReg = FRWD;
+    SubRegKind = FrwdSubReg;
+    return true;
+  } else if (BkwdSubReg < FrwdSubReg) {
+    ForwardSubReg = BKWD;
+    SubRegKind = BkwdSubReg;
+    return true;
+  }
+  return false;
+}
+
+// Lane granularity is 64 bits; mark a bit in the bitvector if the
+// corresponding lane is accessed by the node.
+static bool MarkUsedLanes(SDNode *N, SmallBitVector &Lanes, int StartIdx) {
+  bool retVal = false;
+
+  switch (N->getOpcode()) {
+  default: {
+    int VTSz = N->getValueType(0).getSizeInBits();
+    for (int i = StartIdx, e = GetLaneIndex(VTSz, StartIdx); i < e; i++)
+      Lanes[i] = 1;
+  } break;
+  case ISD::CONCAT_VECTORS: {
+    int SZInBits = 0;
+    for (auto &Oprnd : N->op_values()) {
+      if (!Oprnd.isUndef())
+        retVal |= MarkUsedLanes(Oprnd.getNode(), Lanes, StartIdx);
+      SZInBits += Oprnd.getValueType().getSizeInBits();
+      StartIdx = GetLaneIndex(SZInBits);
+    }
+  } break;
+  case ISD::VECTOR_SHUFFLE: {
+    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+    EVT ElemTy = SV->getOperand(0).getValueType().getVectorElementType();
+    int OperNumElems = SV->getOperand(0).getValueType().getVectorNumElements();
+    int ElemSz = ElemTy.getSizeInBits();
+
+    bool OpersUndef[2] = {SV->getOperand(0).isUndef(),
+                          SV->getOperand(1).isUndef()};
+
+    ArrayRef<int> Mask = SV->getMask();
+    for (int i = 0, e = Mask.size(); i < e; i++) {
+      if (Mask[i] >= 0 && !OpersUndef[Mask[i] >= OperNumElems])
+        Lanes[GetLaneIndex(i * ElemSz, StartIdx)] = 1;
+    }
+  } break;
+  }
+
+  if (Lanes.all())
+    return true;
+
+  return retVal;
+}
+
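+// Example (informal): for a v16i32 operand (512 bits = eight 64-bit lanes),
+// a shuffle whose mask only references elements 0..7 of a non-undef input
+// marks lane bits 0..3 (GetLaneIndex(7 * 32) == 3). The forward scan in
+// GetMinimalUsedSubReg then reports a YMM sub-register, so the 512-bit
+// operation is a candidate for being narrowed to 256 bits.
+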
+// A generic routine which checks whether the operands of a binary
+// operation can be scaled down to a smaller sub-register. On success it
+// also sets StartIdx (the element index at which the operand extraction
+// starts), NewOperVT (the value type of the narrowed result) and PadVT
+// (the value type of the undef padding of the result).
+static bool TryDownScalingBinaryOperands(SDNode *N, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget,
+                                         uint64_t &StartIdx, EVT &NewOperVT,
+                                         EVT &PadVT) {
+  SDLoc DL(N);
+  VecRegKind Op0SubReg = ZMM, Op1SubReg = ZMM;
+  int Op0FrwdSubReg = UNDEF, Op1FrwdSubReg = UNDEF;
+
+  assert(N->getNumOperands() == 2 && "Not a binary operation");
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  EVT OperVT = Op0.getValueType();
+  if (!OperVT.isSimple() || OperVT.getSizeInBits() < 64)
+    return false;
+
+  int LaneSz = GetLaneIndex(OperVT.getSizeInBits());
+  SmallBitVector Op0Lanes(LaneSz, 0);
+  SmallBitVector Op1Lanes(LaneSz, 0);
+
+  EVT OperElemVT = OperVT.getVectorElementType();
+  int OperNumElems = OperVT.getVectorNumElements();
+  int OperElemSZ = OperElemVT.getSizeInBits();
+
+  // Mark the bit of every 64-bit lane that is accessed by the node.
+  bool Op0FullUse = MarkUsedLanes(Op0.getNode(), Op0Lanes, 0);
+  bool Op1FullUse = MarkUsedLanes(Op1.getNode(), Op1Lanes, 0);
+  if (Op0FullUse && Op1FullUse)
+    return false;
+
+  // Find the smallest sub-register which can accommodate the non-undef
+  // part of each operand.
+  bool Op0Shrinkable = GetMinimalUsedSubReg(Op0Lanes, Op0FrwdSubReg, Op0SubReg);
+  bool Op1Shrinkable = GetMinimalUsedSubReg(Op1Lanes, Op1FrwdSubReg, Op1SubReg);
+  if (!Op0Shrinkable && !Op1Shrinkable)
+    return false;
+
+  int OperSubReg = std::min(Op0SubReg, Op1SubReg);
+  int PadNumElems =
+      PowerOf2Floor(OperNumElems - ((OperSubReg + 1) * 64) / OperElemSZ);
+  int NewOperNumElems = OperNumElems - PadNumElems;
+
+  if ((Op0FrwdSubReg && Op1FrwdSubReg && Op0FrwdSubReg != Op1FrwdSubReg) ||
+      NewOperNumElems >= OperNumElems)
+    return false;
+
+  // The legal direction for one of the operands could be UNDEF, so the
+  // two operand directions are OR'ed to obtain the actual direction of
+  // the sub-register.
+  int FrwdSubReg = Op0FrwdSubReg | Op1FrwdSubReg;
+  NewOperVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, NewOperNumElems);
+  PadVT = EVT::getVectorVT(*DAG.getContext(), OperElemVT, PadNumElems);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(NewOperVT) || !TLI.isTypeLegal(PadVT))
+    return false;
+
+  StartIdx = (FrwdSubReg == FRWD) ? 0 : OperNumElems - NewOperNumElems;
+  return true;
+}
+
 // This has so far only been implemented for 64-bit MachO.
 bool X86TargetLowering::useLoadStackGuardNode() const {
   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
@@ -29065,6 +29222,7 @@
   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
 }
 
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -33475,7 +33633,8 @@
 /// set to A, RHS to B, and the routine returns 'true'.
 /// Note that the binary operation should have the property that if one of the
 /// operands is UNDEF then the result is UNDEF.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative,
+                              bool AllowAVX512VT = false) {
   // Look for the following pattern: if
   //   A = < float a0, float a1, float a2, float a3 >
   //   B = < float b0, float b1, float b2, float b3 >
@@ -33492,8 +33651,8 @@
 
   MVT VT = LHS.getSimpleValueType();
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for horizontal add/sub");
+  assert((AllowAVX512VT || VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for horizontal add/sub");
 
   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
   // operate independently on 128-bit lanes.
@@ -33586,6 +33745,40 @@ return true; } +static SDValue TryGenHorizontalAddSub(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool IsCommutative, int Opcode) { + SDLoc DL(N); + uint64_t StartIdx; + EVT PadVT, NewOperVT; + + EVT VT = N->getValueType(0); + assert((VT == MVT::v16i32 || VT == MVT::v16f32) && + "Unexpected DAG node type"); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (isHorizontalBinOp(Op0, Op1, IsCommutative, true) && + (TryDownScalingBinaryOperands(N, DAG, Subtarget, StartIdx, NewOperVT, + PadVT))) { + SDValue NewOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewOperVT, Op0, + DAG.getIntPtrConstant(StartIdx, DL)); + SDValue NewOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewOperVT, Op1, + DAG.getIntPtrConstant(StartIdx, DL)); + SDValue NewN = DAG.getNode(Opcode, SDLoc(N), NewOperVT, NewOp0, NewOp1); + SDValue ConcatOps[2] = {DAG.getUNDEF(PadVT), NewN}; + if (StartIdx == 0) { + ConcatOps[0] = NewN; + ConcatOps[1] = DAG.getUNDEF(PadVT); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); + } + + return SDValue(); +} + + /// Do target-specific dag combines on floating-point adds/subs. static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -33595,13 +33788,20 @@ bool IsFadd = N->getOpcode() == ISD::FADD; assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); + auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; + // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, IsFadd)) { - auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } + + SDValue V; + if ((Subtarget.hasFp256() && (VT == MVT::v16f32)) && + (V = TryGenHorizontalAddSub(N, DAG, Subtarget, IsFadd, NewOpcode))) + return V; + return SDValue(); } @@ -35371,8 +35571,11 @@ return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); } + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + SDLoc DL(N); + const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) @@ -35390,10 +35593,18 @@ isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + SDValue V; + if ((Subtarget.hasInt256() && VT == MVT::v16i32) && + (V = TryGenHorizontalAddSub(N, DAG, Subtarget, true, X86ISD::HADD))) + return V; + if (SDValue V = combineIncDecVector(N, DAG)) return V; - return combineAddOrSubToADCOrSBB(N, DAG); + if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + return V; + + return SDValue(); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -35426,6 +35637,11 @@ isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + SDValue V; + if ((Subtarget.hasInt256() && VT == MVT::v16i32) && + (V = TryGenHorizontalAddSub(N, DAG, Subtarget, false, X86ISD::HSUB))) + return V; + if (SDValue V = combineIncDecVector(N, DAG)) return V; Index: test/CodeGen/X86/avx512-hadd-hsub.ll =================================================================== --- test/CodeGen/X86/avx512-hadd-hsub.ll +++ test/CodeGen/X86/avx512-hadd-hsub.ll @@ -7,8 +7,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,2,3] -; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; @@ -16,8 +15,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -34,8 +32,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: retq ; @@ -43,8 +40,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vphsubd %ymm0, %ymm0, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -61,8 +57,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vhaddps %ymm0, %ymm0, %ymm0 ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -70,8 +65,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 ; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -88,8 +82,7 @@ ; KNL: # BB#0: ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vhsubps %ymm0, %ymm0, %ymm0 ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -97,8 +90,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 ; SKX-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -117,16 +109,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: hadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: hadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> @@ -144,16 +132,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: fhadd_16_3: ; KNL: # BB#0: -; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_3: ; SKX: # BB#0: -; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; SKX-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> @@ -174,16 +158,12 @@ ; CHECK-NEXT: retq ; KNL-LABEL: fhadd_16_4: ; KNL: # BB#0: -; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fhadd_16_4: ; SKX: # BB#0: -; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -329,8 +329,7 @@ ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -78,8 +78,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -104,8 +103,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -327,8 +325,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -355,8 +352,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -800,8 +796,7 @@ ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd 
%xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -829,8 +824,7 @@ ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vphaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: test/CodeGen/X86/shuffle-vector-same-inputs.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/shuffle-vector-same-inputs.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +define <16 x i8> @foo(<64 x i8> %x) { +; AVX512F-LABEL: foo: +; AVX512F: # BB#0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: foo: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: foo: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax +; AVX512BW-NEXT: vpextrb $1, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax +; AVX512BW-NEXT: 
vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: foo: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpextrb $5, %xmm0, %eax +; AVX512BWVL-NEXT: vpextrb $1, %xmm0, %ecx +; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 +; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $9, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $13, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $1, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $5, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $9, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $13, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $1, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $5, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $9, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $13, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $1, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $5, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $9, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $14, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> + ret <16 x i8> %res +}
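
A note on the lane analysis above: the decision of whether a 512-bit add/sub can be narrowed boils down to finding the smallest sub-register that still covers every 64-bit lane an operand actually uses, scanning from both ends of the register. The following standalone sketch illustrates that search. It is not part of the patch; the names, the std::bitset representation and the 1/2/4/8 lane-count encoding are illustrative assumptions (the patch itself uses SmallBitVector and encodes sub-registers as lane count minus one).

// Standalone sketch of the forward/backward minimal sub-register search.
#include <bitset>
#include <cstdio>
#include <string>

enum SubRegLanes { MMX = 1, XMM = 2, YMM = 4, ZMM = 8 }; // lane counts

// Smallest sub-register whose low lanes contain every set bit, else ZMM.
static SubRegLanes minForwardSubReg(const std::bitset<8> &Lanes) {
  for (SubRegLanes R : {MMX, XMM, YMM}) {
    bool Covered = true;
    for (int I = R; I < 8; ++I)
      Covered &= !Lanes.test(I);
    if (Covered)
      return R;
  }
  return ZMM;
}

// Smallest sub-register whose high lanes contain every set bit, else ZMM.
static SubRegLanes minBackwardSubReg(const std::bitset<8> &Lanes) {
  for (SubRegLanes R : {MMX, XMM, YMM}) {
    bool Covered = true;
    for (int I = 0; I < 8 - R; ++I)
      Covered &= !Lanes.test(I);
    if (Covered)
      return R;
  }
  return ZMM;
}

int main() {
  // A v16i32 shuffle touching only the low 256 bits uses 64-bit lanes 0..3.
  std::bitset<8> LowHalf(std::string("00001111")); // bit i == lane i
  std::printf("forward : %d lanes\n", (int)minForwardSubReg(LowHalf));  // 4 (YMM)
  std::printf("backward: %d lanes\n", (int)minBackwardSubReg(LowHalf)); // 8 (ZMM)
  return 0;
}

Since the forward search covers the used lanes with a YMM-sized window while the backward search needs the full ZMM, the forward direction wins and the operation can be extracted from element 0 at half width, mirroring how TryDownScalingBinaryOperands chooses StartIdx and NewOperVT.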