Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10780,9 +10780,6 @@ SDValue ExtVal = Extract.getOperand(1); unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue(); if (Extract.getOperand(0) == VecIn1) { - if (ExtIndex > VT.getVectorNumElements()) - return SDValue(); - Mask.push_back(ExtIndex); continue; } @@ -10797,25 +10794,52 @@ // We can't generate a shuffle node with mismatched input and output types. // Attempt to transform a single input vector to the correct type. - if ((VT != VecIn1.getValueType())) { - // We don't support shuffeling between TWO values of different types. + EVT VecIn1VT = VecIn1.getValueType(); + if ((VT != VecIn1VT)) { + // We don't support shuffling between TWO values of different types. if (VecIn2.getNode()) return SDValue(); + + // If the input vector type has a different base type to the output + // vector type, bail out. + if (VecIn1VT.getVectorElementType() != VT.getVectorElementType()) + return SDValue(); + // See if this is a vector extraction from a larger vector. + // Ignore single element build vectors (just 1 mask element) because + // that's better handled as a scalar, not a vector. + + // TODO: We should be able to allow an arbitrarily larger source vector to + // be extracted into a smaller vector, but this may cause silently wrong + // codegen in the x86 backend at least. For now, limit the transform to + // a simple upper/lower half-size extraction. + if (VecIn1VT.getSizeInBits() == (VT.getSizeInBits() * 2) && + Mask.size() > 1) { + int StartIdx = Mask[0]; + bool IsExtract = true; + // The mask must specify consecutive elements from the source vector. + for (int i = 0, e = Mask.size(); i < e; i++) { + if (Mask[i] != (i + StartIdx)) { + IsExtract = false; + break; + } + } + if (IsExtract) + // TODO: See comment above; we should be able to remove this check. 
+ if (StartIdx == 0 || StartIdx == (signed) VT.getVectorNumElements()) { + SDValue VecIdx = DAG.getIntPtrConstant(StartIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, VecIdx); + } + } + // We only support widening of vectors which are half the size of the // output registers. For example XMM->YMM widening on X86 with AVX. - if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits()) - return SDValue(); - - // If the input vector type has a different base type to the output - // vector type, bail out. - if (VecIn1.getValueType().getVectorElementType() != - VT.getVectorElementType()) + if (VecIn1VT.getSizeInBits() * 2 != VT.getSizeInBits()) return SDValue(); // Widen the input vector by adding undef values. VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + VecIn1, DAG.getUNDEF(VecIn1VT)); } if (UsesZeroVector) Index: test/CodeGen/X86/vec_extract-avx.ll =================================================================== --- test/CodeGen/X86/vec_extract-avx.ll +++ test/CodeGen/X86/vec_extract-avx.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s + +; When extracting multiple consecutive elements from a larger +; vector into a smaller one, do it efficiently. We should use +; an EXTRACT_SUBVECTOR node internally rather than a bunch of +; single element extractions. + +; Extracting the low elements only requires using the right kind of store. 
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 0 + %ext1 = extractelement <8 x float> %v, i32 1 + %ext2 = extractelement <8 x float> %v, i32 2 + %ext3 = extractelement <8 x float> %v, i32 3 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: low_v8f32_to_v4f32 +; CHECK: vmovaps +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Extracting the high elements (4-7, the upper 128-bit half) requires just one AVX instruction. +define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 4 + %ext1 = extractelement <8 x float> %v, i32 5 + %ext2 = extractelement <8 x float> %v, i32 6 + %ext3 = extractelement <8 x float> %v, i32 7 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8f32_to_v4f32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure element type doesn't alter the codegen. Note that +; if we were actually using the vector in this function and +; have AVX2, we should generate vextracti128 (the int version). 
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { + %ext0 = extractelement <8 x i32> %v, i32 4 + %ext1 = extractelement <8 x i32> %v, i32 5 + %ext2 = extractelement <8 x i32> %v, i32 6 + %ext3 = extractelement <8 x i32> %v, i32 7 + %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0 + %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1 + %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2 + %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3 + store <4 x i32> %ins3, <4 x i32>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8i32_to_v4i32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure that element size (64-bit double here) doesn't alter the codegen. +define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { + %ext0 = extractelement <4 x double> %v, i32 2 + %ext1 = extractelement <4 x double> %v, i32 3 + %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 + %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1 + store <2 x double> %ins1, <2 x double>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v4f64_to_v2f64 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} +