Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6112,31 +6112,45 @@
   }
 
   // fold (zext (truncate x)) -> (and x, mask)
-  if (N0.getOpcode() == ISD::TRUNCATE &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
-
+  if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (zext (truncate (load x))) -> (zext (smaller load x))
     // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
     if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
-      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      SDNode *oye = N0.getNode()->getOperand(0).getNode();
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorklist(oye);
       }
-      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
 
-    SDValue Op = N0.getOperand(0);
-    if (Op.getValueType().bitsLT(VT)) {
-      Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
-    } else if (Op.getValueType().bitsGT(VT)) {
-      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-      AddToWorklist(Op.getNode());
+    EVT SrcVT = N0.getOperand(0).getValueType();
+    EVT MinVT = N0.getValueType();
+
+    // Try to mask before the extension to avoid having to generate a larger mask,
+    // possibly over several sub-vectors.
+    if (SrcVT.bitsLT(VT)) {
+      if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
+                               TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
+        SDValue Op = N0.getOperand(0);
+        Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+        AddToWorklist(Op.getNode());
+        return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+      }
+    }
+
+    if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
+      SDValue Op = N0.getOperand(0);
+      if (SrcVT.bitsLT(VT)) {
+        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      } else if (SrcVT.bitsGT(VT)) {
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+        AddToWorklist(Op.getNode());
+      }
+      return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
     }
-    return DAG.getZeroExtendInReg(Op, SDLoc(N),
-                                  N0.getValueType().getScalarType());
   }
 
   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
Index: test/CodeGen/SystemZ/insert-05.ll
===================================================================
--- test/CodeGen/SystemZ/insert-05.ll
+++ test/CodeGen/SystemZ/insert-05.ll
@@ -214,8 +214,8 @@
 ; The truncation here isn't free; we need an explicit zero extension.
 define i64 @f19(i32 %a) {
 ; CHECK-LABEL: f19:
-; CHECK: llgcr %r2, %r2
-; CHECK: oihl %r2, 1
+; CHECK: llcr %r2, %r2
+; CHECK: iihf %r2, 1
 ; CHECK: br %r14
   %trunc = trunc i32 %a to i8
   %ext = zext i8 %trunc to i64
Index: test/CodeGen/SystemZ/int-conv-04.ll
===================================================================
--- test/CodeGen/SystemZ/int-conv-04.ll
+++ test/CodeGen/SystemZ/int-conv-04.ll
@@ -5,7 +5,7 @@
 ; Test register extension, starting with an i32.
 define i64 @f1(i32 %a) {
 ; CHECK-LABEL: f1:
-; CHECK: llgcr %r2, %r2
+; CHECK: risbg %r2, %r2, 56, 191, 0
 ; CHECK: br %r14
   %byte = trunc i32 %a to i8
   %ext = zext i8 %byte to i64
Index: test/CodeGen/SystemZ/int-conv-08.ll
===================================================================
--- test/CodeGen/SystemZ/int-conv-08.ll
+++ test/CodeGen/SystemZ/int-conv-08.ll
@@ -5,7 +5,7 @@
 ; Test register extension, starting with an i32.
 define i64 @f1(i32 %a) {
 ; CHECK-LABEL: f1:
-; CHECK: llghr %r2, %r2
+; CHECK: risbg %r2, %r2, 48, 191, 0
 ; CHECK: br %r14
   %half = trunc i32 %a to i16
   %ext = zext i16 %half to i64
Index: test/CodeGen/X86/avx2-conversions.ll
===================================================================
--- test/CodeGen/X86/avx2-conversions.ll
+++ test/CodeGen/X86/avx2-conversions.ll
@@ -61,8 +61,8 @@
 define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
 ; CHECK-LABEL: zext_8i8_8i32:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %B = zext <8 x i8> %A to <8 x i32>
   ret <8 x i32>%B
Index: test/CodeGen/X86/vec_cast2.ll
===================================================================
--- test/CodeGen/X86/vec_cast2.ll
+++ test/CodeGen/X86/vec_cast2.ll
@@ -46,11 +46,10 @@
 define <8 x float> @foo2_8(<8 x i8> %src) {
 ; CHECK-LABEL: foo2_8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpand LCPI2_0, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
@@ -98,25 +97,25 @@
 ; CHECK-WIDE:       ## BB#0:
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
@@ -135,13 +134,13 @@
 ; CHECK-WIDE:       ## BB#0:
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
 ; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -910,50 +910,46 @@
 define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ; SSE2-LABEL: zext_8i8_to_8i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_8i8_to_8i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_8i8_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
 ; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
   %t = zext <8 x i8> %z to <8 x i32>
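
Note (illustrative only, not part of the patch): the IR sketch below restates the scalar pattern the new DAGCombiner path handles. The function name is hypothetical; equivalent coverage already exists in f1 of int-conv-04.ll and in zext_8i8_to_8i32 of vector-zext.ll above.

; Sketch only: here SrcVT = i32, MinVT = i8 and VT = i64, so the combiner now
; builds (i64 zext (i32 and %a, 255)), masking in the narrow source type first,
; instead of (i64 and (any_extend %a), 255), which masked after the extension.
define i64 @zext_trunc_sketch(i32 %a) {
  %t = trunc i32 %a to i8
  %e = zext i8 %t to i64
  ret i64 %e
}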