Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1526,6 +1526,15 @@
                                                  Type *Ty) const {
     return false;
   }
+
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index. This is needed because EXTRACT_SUBVECTOR usually
+  /// has custom lowering that depends on the index of the first element,
+  /// and only the target knows which lowering is cheap.
+  virtual bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10783,9 +10783,6 @@
     SDValue ExtVal = Extract.getOperand(1);
     unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
     if (Extract.getOperand(0) == VecIn1) {
-      if (ExtIndex > VT.getVectorNumElements())
-        return SDValue();
-
       Mask.push_back(ExtIndex);
       continue;
     }
@@ -10805,20 +10802,34 @@
     if (VecIn2.getNode())
       return SDValue();
 
-    // We only support widening of vectors which are half the size of the
-    // output registers. For example XMM->YMM widening on X86 with AVX.
-    if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
-      return SDValue();
-
     // If the input vector type has a different base type to the output
     // vector type, bail out.
     if (VecIn1.getValueType().getVectorElementType() !=
         VT.getVectorElementType())
       return SDValue();
 
-    // Widen the input vector by adding undef values.
-    VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                         VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+    // If the input vector is too small, widen it.
+    // We only support widening of vectors which are half the size of the
+    // output registers. For example XMM->YMM widening on X86 with AVX.
+    EVT VecInT = VecIn1.getValueType();
+    if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
+      // Widen the input vector by adding undef values.
+      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+    } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
+      // If the input vector is too large, try to split it.
+      if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
+        return SDValue();
+
+      // Try to replace VecIn1 with two extract_subvectors.
+      // No need to update the masks; they should still be correct.
+      VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+                           DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy()));
+      VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+                           DAG.getConstant(0, TLI.getVectorIdxTy()));
+      UsesZeroVector = false;
+    } else
+      return SDValue();
   }
 
   if (UsesZeroVector)
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -791,6 +791,10 @@
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
       return nullptr; // nothing to do, move along.
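
A side note for readers, not part of the patch: the size dispatch introduced in the
DAGCombiner hunk above can be summarized by the following standalone sketch. The enum
and helper names (WidenAction, classifyBuildVectorInput) are invented for illustration
only; the real code works on EVTs and SDValues and emits CONCAT_VECTORS or
EXTRACT_SUBVECTOR nodes rather than returning a tag.

#include <cassert>

// Hypothetical tags standing in for the three branches of the combine.
enum class WidenAction { WidenWithUndef, SplitInTwo, GiveUp };

// Mirrors the new if / else-if / else chain: an input half the output width
// is widened with undef elements (e.g. XMM -> YMM), an input twice the
// output width is split with two EXTRACT_SUBVECTORs, and anything else is
// rejected. The split is attempted only when the target reports the
// subvector extract as cheap.
WidenAction classifyBuildVectorInput(unsigned InBits, unsigned OutBits,
                                     bool ExtractIsCheap) {
  if (InBits * 2 == OutBits)
    return WidenAction::WidenWithUndef;
  if (InBits == OutBits * 2)
    return ExtractIsCheap ? WidenAction::SplitInTwo : WidenAction::GiveUp;
  return WidenAction::GiveUp;
}

int main() {
  // 128-bit input, 256-bit output: the pre-existing widening path.
  assert(classifyBuildVectorInput(128, 256, true) == WidenAction::WidenWithUndef);
  // 256-bit input, 128-bit output: the new splitting path added here.
  assert(classifyBuildVectorInput(256, 128, true) == WidenAction::SplitInTwo);
  // Without a cheap EXTRACT_SUBVECTOR the combine still bails out.
  assert(classifyBuildVectorInput(256, 128, false) == WidenAction::GiveUp);
  return 0;
}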
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -3846,6 +3846,14 @@
   return true;
 }
 
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
 /// the specified range (L, H].
 static bool isUndefOrInRange(int Val, int Low, int Hi) {
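
As a reading aid, also not part of the patch: the X86 override above only reports an
extract as cheap when it starts at element 0 or at element NumElts of the result type,
i.e. at the low or high half of a source exactly twice as wide. A freestanding sketch
of that predicate, with the legality query reduced to a plain boolean parameter
(ResNumElts stands in for ResVT.getVectorNumElements()):

#include <cassert>

// Standalone restatement of the X86 hook; 'ExtractLegalOrCustom' models the
// isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT) query.
bool isExtractSubvectorCheapSketch(unsigned ResNumElts, unsigned Index,
                                   bool ExtractLegalOrCustom) {
  if (!ExtractLegalOrCustom)
    return false;
  // Index 0 is the low half (often free: just use the narrow register);
  // Index == ResNumElts is the high half (one vextractf128 on AVX).
  return Index == 0 || Index == ResNumElts;
}

int main() {
  // Extracting v4f32 from v8f32: indices 0 and 4 name the two halves.
  assert(isExtractSubvectorCheapSketch(4, 0, true));
  assert(isExtractSubvectorCheapSketch(4, 4, true));
  // A misaligned start such as index 2 would need a real shuffle.
  assert(!isExtractSubvectorCheapSketch(4, 2, true));
  return 0;
}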
Index: test/CodeGen/X86/vec_extract-avx.ll
===================================================================
--- test/CodeGen/X86/vec_extract-avx.ll
+++ test/CodeGen/X86/vec_extract-avx.ll
@@ -0,0 +1,82 @@
+target triple = "x86_64-unknown-unknown"
+
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+; When extracting multiple consecutive elements from a larger
+; vector into a smaller one, do it efficiently. We should use
+; an EXTRACT_SUBVECTOR node internally rather than a bunch of
+; single element extractions.
+
+; Extracting the low elements only requires using the right kind of store.
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 0
+  %ext1 = extractelement <8 x float> %v, i32 1
+  %ext2 = extractelement <8 x float> %v, i32 2
+  %ext3 = extractelement <8 x float> %v, i32 3
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: low_v8f32_to_v4f32
+; CHECK: vmovaps
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Extracting the high elements requires just one AVX instruction.
+define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 4
+  %ext1 = extractelement <8 x float> %v, i32 5
+  %ext2 = extractelement <8 x float> %v, i32 6
+  %ext3 = extractelement <8 x float> %v, i32 7
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8f32_to_v4f32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure element type doesn't alter the codegen. Note that
+; if we were actually using the vector in this function and
+; have AVX2, we should generate vextracti128 (the int version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
+  %ext0 = extractelement <8 x i32> %v, i32 4
+  %ext1 = extractelement <8 x i32> %v, i32 5
+  %ext2 = extractelement <8 x i32> %v, i32 6
+  %ext3 = extractelement <8 x i32> %v, i32 7
+  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
+  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8i32_to_v4i32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure that element size doesn't alter the codegen.
+define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
+  %ext0 = extractelement <4 x double> %v, i32 2
+  %ext1 = extractelement <4 x double> %v, i32 3
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  store <2 x double> %ins1, <2 x double>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v4f64_to_v2f64
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1552,6 +1552,37 @@
   ret <4 x i32> %2
 }
 
+define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
+; SSE-LABEL: combine_test21:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movdqa %xmm2,
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_test21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovdqa %xmm2,
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-NEXT:    vmovdqa %xmm2,
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  store <4 x i32> %1, <4 x i32>* %ptr, align 16
+  ret <4 x i32> %2
+}
+
 ; Check some negative cases.
 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
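
One last illustration, not part of the patch: the two shufflevector masks in
combine_test21 (reconstructed here as {0,1,4,5} and {2,3,6,7} from the CHECK lines,
since the constants were garbled in transit) pick adjacent i32 pairs, which is why
they lower to the qword unpack instructions in the expected output. A scalar model
of the two shuffles, with arbitrary element values chosen to make the lane movement
visible:

#include <array>
#include <cstdio>

// Scalar model of shufflevector on a <8 x i32> source.
using Vec8 = std::array<int, 8>;
using Vec4 = std::array<int, 4>;

Vec4 shuffle(const Vec8 &Src, const std::array<int, 4> &Mask) {
  Vec4 Out{};
  for (int I = 0; I != 4; ++I)
    Out[I] = Src[Mask[I]];
  return Out;
}

int main() {
  Vec8 A = {0, 1, 2, 3, 4, 5, 6, 7}; // stands in for %a
  // Mask {0,1,4,5}: the low qword of each 128-bit half -> punpcklqdq.
  Vec4 Lo = shuffle(A, {0, 1, 4, 5});
  // Mask {2,3,6,7}: the high qword of each 128-bit half -> punpckhqdq.
  Vec4 Hi = shuffle(A, {2, 3, 6, 7});
  std::printf("%d %d %d %d\n", Lo[0], Lo[1], Lo[2], Lo[3]); // 0 1 4 5
  std::printf("%d %d %d %d\n", Hi[0], Hi[1], Hi[2], Hi[3]); // 2 3 6 7
  return 0;
}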