Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18585,8 +18585,22 @@
     if (ConcatSrcNumElts == ExtNumElts)
       return V.getOperand(ConcatOpIdx);
 
-    // TODO: Handle the case where the concat operands are larger than the
-    //       result of this extract by extracting directly from a concat op.
+    // If the concatenated source vectors are a multiple length of this extract,
+    // then extract a fraction of one of those source vectors directly from a
+    // concat operand. Example:
+    //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
+    //   v2i8 extract_subvec v8i8 Y, 6
+    if (ConcatSrcNumElts % ExtNumElts == 0) {
+      SDLoc DL(N);
+      unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
+      assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
+             "Trying to extract from >1 concat operand?");
+      assert(NewExtIdx % ExtNumElts == 0 &&
+             "Extract index is not a multiple of the input vector length.");
+      SDValue NewIndexC = DAG.getIntPtrConstant(NewExtIdx, DL);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
+                         V.getOperand(ConcatOpIdx), NewIndexC);
+    }
   }
 
   V = peekThroughBitcasts(V);
Index: llvm/test/CodeGen/X86/avg.ll
===================================================================
--- llvm/test/CodeGen/X86/avg.ll
+++ llvm/test/CodeGen/X86/avg.ll
@@ -462,14 +462,12 @@
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, (%rax)
-; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovdqu %xmm1, (%rax)
+; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vmovdqu %xmm2, (%rax)
 ; AVX512BW-NEXT:    retq
   %1 = load <48 x i8>, <48 x i8>* %a
   %2 = load <48 x i8>, <48 x i8>* %b
Index: llvm/test/CodeGen/X86/pr34657.ll
===================================================================
--- llvm/test/CodeGen/X86/pr34657.ll
+++ llvm/test/CodeGen/X86/pr34657.ll
@@ -5,13 +5,12 @@
 ; CHECK-LABEL: pr34657:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    vmovups 64(%rsi), %ymm0
-; CHECK-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm1
-; CHECK-NEXT:    vmovups (%rsi), %zmm2
-; CHECK-NEXT:    vmovaps %ymm0, 64(%rdi)
-; CHECK-NEXT:    vmovaps %zmm2, (%rdi)
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, 96(%rdi)
+; CHECK-NEXT:    vmovups (%rsi), %zmm0
+; CHECK-NEXT:    vmovups 64(%rsi), %ymm1
+; CHECK-NEXT:    vmovups 96(%rsi), %xmm2
+; CHECK-NEXT:    vmovaps %xmm2, 96(%rdi)
+; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
Index: llvm/test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1055,64 +1055,24 @@
 }
 
 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
-; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqu %xmm0, 16(%rdi)
-; AVX1-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX1-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqu %xmm0, 16(%rdi)
-; AVX2-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX2-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm1
-; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
-; AVX512-NEXT:    vextracti32x4 $2, %zmm1, 32(%rdi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
+; AVX-NEXT:    vmovdqu %xmm2, 32(%rdi)
+; AVX-NEXT:    retq
 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32>
 %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32>
 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32>
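
Note (illustration only, not part of the patch): the sketch below is a minimal standalone C++ model of the index arithmetic behind the new fold. It reuses the variable names from the DAGCombiner change (ConcatOpIdx, NewExtIdx, and so on) but none of the LLVM APIs, and the helper name narrowExtractOfConcat is invented for this example. For an EXTRACT_SUBVECTOR of ExtNumElts elements at ExtIdx taken from a CONCAT_VECTORS whose operands each hold ConcatSrcNumElts elements, it computes which concat operand to read from and the narrowed extract index; main() reproduces the v2i8-from-v16i8 example in the patch comment.

// Standalone sketch of the fold's index math; no LLVM headers required.
#include <cassert>
#include <cstdio>

struct NarrowedExtract {
  unsigned ConcatOpIdx; // which concat_vectors operand feeds the extract
  unsigned NewExtIdx;   // element offset within that operand
};

// Models the case guarded by "ConcatSrcNumElts % ExtNumElts == 0": an extract
// of ExtNumElts elements at ExtIdx from a concat whose operands each have
// ConcatSrcNumElts elements becomes a narrower extract from one operand.
static NarrowedExtract narrowExtractOfConcat(unsigned ExtIdx,
                                             unsigned ExtNumElts,
                                             unsigned ConcatSrcNumElts) {
  assert(ConcatSrcNumElts % ExtNumElts == 0 && "fold does not apply");
  unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
  unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
  assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
         "Trying to extract from >1 concat operand?");
  return {ConcatOpIdx, NewExtIdx};
}

int main() {
  // Example from the patch comment:
  //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14
  //   --> v2i8 extract_subvec (v8i8 Y), 6
  NarrowedExtract R = narrowExtractOfConcat(/*ExtIdx=*/14, /*ExtNumElts=*/2,
                                            /*ConcatSrcNumElts=*/8);
  std::printf("concat operand %u, new index %u\n", R.ConcatOpIdx, R.NewExtIdx);
  return 0;
}

The same arithmetic explains the test updates above: once the extract lands directly on a concat operand, the x86 backend no longer needs to build a wide ymm/zmm value and extract from it, so the 512-bit insert/extract sequences collapse into plain xmm stores.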