Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -24903,24 +24903,118 @@
   }
 }
 
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // A vector zext_in_reg may be represented as a shuffle,
+  // feeding into a bitcast (this represents anyext) feeding into
+  // an and with a mask.
+  // We'd like to try to combine that into a shuffle with zero
+  // plus a bitcast, removing the and.
+  if (N0.getOpcode() != ISD::BITCAST ||
+      N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  // The other side of the AND should be a splat of 2^C - 1, where C
+  // is the number of bits in the source type.
+  if (N1.getOpcode() == ISD::BITCAST)
+    N1 = N1.getOperand(0);
+  if (N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+  BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+  EVT SrcType = Shuffle->getValueType(0);
+
+  // We expect a single-source shuffle
+  if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+    return SDValue();
+
+  unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+                               SplatBitSize, HasAnyUndefs))
+    return SDValue();
+
+  unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+  // Make sure the splat matches the mask we expect
+  if (SplatBitSize > ResSize ||
+      (SplatValue + 1).exactLogBase2() != SrcSize)
+    return SDValue();
+
+  // Make sure the input and output size make sense
+  if (SrcSize >= ResSize || ResSize % SrcSize)
+    return SDValue();
+
+  // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+  // The number of u's between each two values depends on the ratio between
+  // the source and dest type.
+  unsigned ZextRatio = ResSize / SrcSize;
+  bool IsZext = true;
+  for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+    if (i % ZextRatio) {
+      if (Shuffle->getMaskElt(i) > 0) {
+        // Expected undef
+        IsZext = false;
+        break;
+      }
+    } else {
+      if (Shuffle->getMaskElt(i) != (i / ZextRatio)) {
+        // Expected element number
+        IsZext = false;
+        break;
+      }
+    }
+  }
+
+  if (!IsZext)
+    return SDValue();
+
+  // Ok, perform the transformation - replace the shuffle with
+  // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+  // (instead of undef) where the k elements come from the zero vector.
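+  // For example, zero-extending v8i16 elements in-register to v4i32 gives
+  // ZextRatio == 2, so the mask built below is <0, 8, 1, 8, 2, 8, 3, 8>;
+  // index 8 (== NumElems) selects element 0 of the all-zeros second shuffle
+  // operand, so those lanes read zero instead of undef.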
+  SmallVector<int, 8> Mask;
+  unsigned NumElems = SrcType.getVectorNumElements();
+  for (unsigned i = 0; i < NumElems; ++i)
+    if (i % ZextRatio)
+      Mask.push_back(NumElems);
+    else
+      Mask.push_back(i / ZextRatio);
+
+  SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+    Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
+  return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
-  EVT VT = N->getValueType(0);
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
+  if (Zext.getNode())
+    return Zext;
+
   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
   if (R.getNode())
     return R;
 
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
   // Create BEXTR instructions
   // BEXTR is ((X >> imm) & (2**size-1))
   if (VT == MVT::i32 || VT == MVT::i64) {
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
-    SDLoc DL(N);
-
     // Check for BEXTR.
     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -24948,10 +25042,6 @@
   if (VT != MVT::v2i64 && VT != MVT::v4i64)
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
   // Check LHS for vnot
   if (N0.getOpcode() == ISD::XOR &&
       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll
@@ -7,34 +7,30 @@
 define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: zext_8i16_to_8i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_8i16_to_8i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_8i16_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand .LCPI0_0(%rip), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
@@ -158,34 +154,30 @@
 define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
 ; SSE2-LABEL: zext_16i8_to_16i16:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    # kill
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_16i8_to_16i16:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    # kill
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_16i8_to_16i16:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxbw %xmm1, %xmm0 {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pand .LCPI3_0(%rip), %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
@@ -207,26 +199,24 @@
 
 define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
 ; SSE2-LABEL: load_zext_16i8_to_16i16:
-; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    retq
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pand .LCPI4_0(%rip), %xmm1
+; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_16i8_to_16i16:
-; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa (%rdi), %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    retq
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    pand .LCPI4_0(%rip), %xmm1
+; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_16i8_to_16i16:
 ; SSE41:       # BB#0: # %entry
@@ -253,26 +243,24 @@
 
 define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
 ; SSE2-LABEL: load_zext_8i16_to_8i32:
-; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    retq
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand .LCPI5_0(%rip), %xmm1
+; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_zext_8i16_to_8i32:
-; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa (%rdi), %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSSE3-NEXT:    retq
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand .LCPI5_0(%rip), %xmm1
+; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_zext_8i16_to_8i32:
 ; SSE41:       # BB#0: # %entry