diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10606,22 +10606,26 @@
   // fold (aext (load x)) -> (aext (truncate (extload x)))
   // None of the supported targets knows how to perform load and any_ext
-  // on vectors in one instruction. We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+  // on vectors in one instruction, so attempt to fold to zext instead.
+  if (VT.isVector()) {
+    // Try to simplify (zext (load x)).
+    if (SDValue foldedExt =
+            tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+                               ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
+      return foldedExt;
+  } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
+             ISD::isUNINDEXEDLoad(N0.getNode()) &&
+             TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
-    SmallVector<SDNode*, 4> SetCCs;
+    SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
-      DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
-                                        TLI);
+      DoXform =
+          ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
-                                       LN0->getChain(),
-                                       LN0->getBasePtr(), N0.getValueType(),
-                                       LN0->getMemOperand());
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       N0.getValueType(), LN0->getMemOperand());
       ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
       // If the load value is used only by N, replace it via CombineTo N.
       bool NoReplaceTrunc = N0.hasOneUse();
@@ -10630,8 +10634,8 @@
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
         recursivelyDeleteUnusedNodes(LN0);
       } else {
-        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                    N0.getValueType(), ExtLoad);
+        SDValue Trunc =
+            DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
         CombineTo(LN0, Trunc, ExtLoad.getValue(1));
       }
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
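A note on the test updates that follow: although their IR spells zext, the extend reaches the combiner as any_extend, because every bit added by the extension is later discarded (by a truncating store, or by pmuludq's low-half semantics) and SimplifyDemandedBits relaxes zero_extend to any_extend in that case. It is that (aext (load x)) on vectors which previously refused to fold. A minimal sketch of the shape involved, illustrative only and not one of the patch's tests (function and value names are invented):

; Illustrative sketch (hypothetical test, not part of this patch).
; The truncating store keeps only the low 8 bits of each lane, so the
; zext below is relaxed to any_extend in the DAG; with this change the
; resulting (aext (load x)) folds to a single ZEXTLOAD when the target
; reports that extending load as legal.
define void @aext_load_roundtrip(<8 x i8>* %p) {
  %v = load <8 x i8>, <8 x i8>* %p, align 1
  %w = zext <8 x i8> %v to <8 x i64>
  %a = add <8 x i64> %w, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %t = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %t, <8 x i8>* %p, align 1
  ret void
}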
diff --git a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
--- a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -4,25 +4,13 @@
 define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; KNL-LABEL: any_extend_load_v8i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; KNL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; KNL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; KNL-NEXT:    vpmovqb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: any_extend_load_v8i64:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT:    vpmovqb %zmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; ALL-LABEL: any_extend_load_v8i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
   %1 = zext <8 x i8> %wide.load to <8 x i64>
   %2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4368,9 +4368,8 @@
 define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovq (%rdi), %xmm0 # xmm0 = mem[0],zero
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
   %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1904,8 +1904,7 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; X86-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
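The vector-mul.ll hunk above shows the same fold from another angle: pmuludq reads only the low 32 bits of each 64-bit lane, so the high half of an operand is don't-care and its extend is represented as any_extend. A reduced sketch of that shape, again hypothetical (names invented, assuming an SSE4.1-capable target so pmovzxdq is available):

; Illustrative sketch (not a test from this patch). Lowering the 64-bit
; multiply via pmuludq leaves the high half of each lane unused, so one
; operand's extend becomes any_extend; with this combine it now matches
; a pmovzxdq extending load instead of the old movq + punpckldq pair.
define <2 x i64> @mul_widened_loads(<2 x i32>* %a, <2 x i32>* %b) {
  %x = load <2 x i32>, <2 x i32>* %a, align 8
  %y = load <2 x i32>, <2 x i32>* %b, align 8
  %xw = zext <2 x i32> %x to <2 x i64>
  %yw = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xw, %yw
  ret <2 x i64> %m
}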