diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44625,6 +44625,36 @@
                            VT, St->getMemOperand(), DAG);
   }
 
+  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
+  if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
+    auto IsExtractedElement = [](SDValue V) {
+      if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
+        V = V.getOperand(0);
+      unsigned Opc = V.getOpcode();
+      if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
+        if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
+          return V.getOperand(0);
+      }
+      return SDValue();
+    };
+    if (SDValue Extract = IsExtractedElement(StoredVal)) {
+      SDValue Trunc = peekThroughOneUseBitcasts(Extract.getOperand(0));
+      if (Trunc.getOpcode() == X86ISD::VTRUNC) {
+        SDValue Src = Trunc.getOperand(0);
+        MVT DstVT = Trunc.getSimpleValueType();
+        MVT SrcVT = Src.getSimpleValueType();
+        unsigned NumSrcElts = SrcVT.getVectorNumElements();
+        unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
+        MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+        if (NumTruncBits == VT.getSizeInBits() &&
+            TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
+          return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
+                                   TruncVT, St->getMemOperand());
+        }
+      }
+    }
+  }
+
   // Optimize trunc store (of multiple scalars) to shuffle and store.
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -61,8 +61,7 @@
 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32>
@@ -159,8 +158,7 @@
 ; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
@@ -173,8 +171,7 @@
 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
 ; AVX512BWVL-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %L
   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32>
@@ -338,8 +335,7 @@
 ; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
@@ -352,8 +348,7 @@
 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32>
@@ -456,8 +451,7 @@
 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovqw %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
@@ -470,8 +464,7 @@
 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovqw %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
 ; AVX512BWVL-NEXT:    retq
   %vec = load <8 x i16>, <8 x i16>* %L
   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32>
@@ -576,8 +569,7 @@
 ; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovqb %xmm0, %xmm0
-; AVX512VL-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
@@ -590,8 +582,7 @@
 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovqb %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpextrw $0, %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i8>, <16 x i8>* %L
   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -383,8 +383,7 @@
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -399,16 +398,14 @@
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VBMIVL:       # %bb.0:
 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
@@ -1067,8 +1064,7 @@
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -1083,16 +1079,14 @@
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VBMIVL:       # %bb.0:
 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
@@ -1178,8 +1172,7 @@
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -1194,16 +1187,14 @@
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VBMIVL:       # %bb.0:
 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
 ; AVX512VBMIVL-NEXT:    vzeroupper
 ; AVX512VBMIVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -233,8 +233,7 @@
 ; AVX512-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
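For reference, the IR pattern these tests exercise looks roughly like the sketch below: a strided shuffle that keeps the low part of each wide lane, which X86 DAG combining turns into an X86ISD::VTRUNC whose low elements are then stored. The function name and the even-element shuffle mask here are illustrative assumptions, since the mask constants are truncated in the hunks above.

define void @trunc_v8i16_to_v8i8_store(<16 x i8>* %L, <8 x i8>* %S) {
  %vec = load <16 x i8>, <16 x i8>* %L
  ; keep the even-numbered bytes, i.e. the low byte of each i16 lane (mask assumed)
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

Before this change, AVX512BWVL lowered such a store as vpmovwb %xmm0, %xmm0 followed by vmovq %xmm0, (%rsi); with the new combine the truncate and the store fold into the single memory-destination vpmovwb shown in the updated checks.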