Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -475,7 +475,8 @@
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
-                                  SDValue VecIn2, unsigned LeftIdx);
+                                  SDValue VecIn2, unsigned LeftIdx,
+                                  bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
 
     /// Walk up chain skipping non-aliasing memory nodes,
@@ -16393,7 +16394,7 @@
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
-                                           unsigned LeftIdx) {
+                                           unsigned LeftIdx, bool DidSplitVec) {
   MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
   SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
 
@@ -16401,17 +16402,12 @@
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
-  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
 
-  // In case both the input vectors are extracted from same base
-  // vector we do not need extra addend (Vec2Offset) while
-  // computing shuffle mask.
-  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
-    Vec2Offset = InVT1.getVectorNumElements();
+  // If we artificially split a vector in two already, then the offsets in the
+  // operands will all be based off of VecIn1, even those in VecIn2.
+  unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
@@ -16670,6 +16666,7 @@
   // vector, then split the vector efficiently based on the maximum
   // vector access index and adjust the VectorMask and
   // VecIn accordingly.
+  bool DidSplitVec = false;
   if (VecIn.size() == 2) {
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
@@ -16700,6 +16697,7 @@
       VecIn.pop_back();
      VecIn.push_back(VecIn1);
       VecIn.push_back(VecIn2);
+      DidSplitVec = true;
 
       for (unsigned i = 0; i < NumElems; i++) {
         if (VectorMask[i] <= 0)
@@ -16737,7 +16735,7 @@
         (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
 
     if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
-                                                VecRight, LeftIdx))
+                                                VecRight, LeftIdx, DidSplitVec))
       Shuffles.push_back(Shuffle);
     else
       return SDValue();
Index: test/CodeGen/X86/shuffle-extract-subvector.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %r8d
+; CHECK-NEXT:    movzwl 2(%rdi), %r9d
+; CHECK-NEXT:    movzwl 4(%rdi), %r11d
+; CHECK-NEXT:    movzwl 6(%rdi), %edi
+; CHECK-NEXT:    movzwl (%rsi), %r10d
+; CHECK-NEXT:    movzwl 2(%rsi), %ecx
+; CHECK-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-NEXT:    movzwl 6(%rsi), %esi
+; CHECK-NEXT:    movw %si, 14(%rdx)
+; CHECK-NEXT:    movw %di, 12(%rdx)
+; CHECK-NEXT:    movw %ax, 10(%rdx)
+; CHECK-NEXT:    movw %r11w, 8(%rdx)
+; CHECK-NEXT:    movw %cx, 6(%rdx)
+; CHECK-NEXT:    movw %r9w, 4(%rdx)
+; CHECK-NEXT:    movw %r10w, 2(%rdx)
+; CHECK-NEXT:    movw %r8w, (%rdx)
+; CHECK-NEXT:    retq
+  %tmp4 = load <4 x half>, <4 x half>* %a
+  %tmp5 = load <4 x half>, <4 x half>* %b
+  %tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 0, i32 4>
+  %tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 1, i32 5>
+  %tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 2, i32 6>
+  %tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 3, i32 7>
+  %tmp11 = extractelement <2 x half> %tmp7, i32 0
+  %tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0
+  %tmp13 = extractelement <2 x half> %tmp7, i32 1
+  %tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1
+  %tmp15 = extractelement <2 x half> %tmp8, i32 0
+  %tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2
+  %tmp17 = extractelement <2 x half> %tmp8, i32 1
+  %tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3
+  %tmp19 = extractelement <2 x half> %tmp9, i32 0
+  %tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4
+  %tmp21 = extractelement <2 x half> %tmp9, i32 1
+  %tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5
+  %tmp23 = extractelement <2 x half> %tmp10, i32 0
+  %tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6
+  %tmp25 = extractelement <2 x half> %tmp10, i32 1
+  %tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7
+  store <8 x half> %tmp26, <8 x half>* %c
+  ret void
+}