Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -1526,6 +1526,15 @@ Type *Ty) const { return false; } + + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. This is needed because EXTRACT_SUBVECTOR usually + /// has custom lowering that depends on the index of the first element, + /// and only the target knows which lowering is cheap. + virtual bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const { + return false; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10783,9 +10783,6 @@ SDValue ExtVal = Extract.getOperand(1); unsigned ExtIndex = cast(ExtVal)->getZExtValue(); if (Extract.getOperand(0) == VecIn1) { - if (ExtIndex > VT.getVectorNumElements()) - return SDValue(); - Mask.push_back(ExtIndex); continue; } @@ -10805,20 +10802,34 @@ if (VecIn2.getNode()) return SDValue(); - // We only support widening of vectors which are half the size of the - // output registers. For example XMM->YMM widening on X86 with AVX. - if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits()) - return SDValue(); - // If the input vector type has a different base type to the output // vector type, bail out. if (VecIn1.getValueType().getVectorElementType() != VT.getVectorElementType()) return SDValue(); - // Widen the input vector by adding undef values. - VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + // If the input vector is too small, widen it. + // We only support widening of vectors which are half the size of the + // output registers. For example XMM->YMM widening on X86 with AVX. + EVT VecInT = VecIn1.getValueType(); + if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) { + // Widen the input vector by adding undef values. + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) { + // If the input vector is too large, try to split it. + if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements())) + return SDValue(); + + // Try to replace VecIn1 with two extract_subvectors + // No need to update the masks, they should still be correct. + VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, + DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy())); + VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, + DAG.getConstant(0, TLI.getVectorIdxTy())); + UsesZeroVector = false; + } else + return SDValue(); } if (UsesZeroVector) Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -791,6 +791,10 @@ bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { return nullptr; // nothing to do, move along. Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -3846,6 +3846,14 @@ return true; } +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -1552,6 +1552,28 @@ ret <4 x i32> %2 } +define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { +; SSE-LABEL: combine_test21: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test21: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: movdqa %xmm2, +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> + %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> + store <4 x i32> %1, <4 x i32>* %ptr, align 16 + ret <4 x i32> %2 +} ; Check some negative cases. ; FIXME: Do any of these really make sense? Are they redundant with the above tests?