Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -1124,12 +1124,12 @@
       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
     }
 
-    if (HasInt256) {
-      for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
-        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
-      }
+    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+    }
 
+    if (HasInt256) {
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -19713,18 +19713,20 @@
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
-      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is256BitVector() && Subtarget.hasAVX()) &&
       !(VT.is512BitVector() && Subtarget.hasAVX512()))
     return SDValue();
 
   SDLoc dl(Op);
+  unsigned Opc = Op.getOpcode();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   // For 512-bit vectors, we need 128-bits or 256-bits.
   if (VT.getSizeInBits() > 128) {
     // Input needs to be at least the same number of elements as output, and
     // at least 128-bits.
-    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    int InSize = InSVT.getSizeInBits() * NumElts;
     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
   }
 
@@ -19733,14 +19735,31 @@
   // need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
-    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
-                        X86ISD::VSEXT : X86ISD::VZEXT;
+    unsigned ExtOpc =
+        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
+  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
+  if (Subtarget.hasAVX()) {
+    assert(VT.is256BitVector() && "256-bit vector expected");
+    int HalfNumElts = NumElts / 2;
+    MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
+
+    InVT = In.getSimpleValueType();
+    unsigned NumSrcElts = InVT.getVectorNumElements();
+    SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
+    for (int i = 0; i != HalfNumElts; ++i)
+      HiMask[i] = HalfNumElts + i;
+
+    SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
+    SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
+    Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+  }
+
   // We should only get here for sign extend.
-  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
-         "Unexpected opcode!");
+  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
 
   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   SDValue Curr = In;
@@ -38346,11 +38365,11 @@
                        DAG.getIntPtrConstant(0, DL));
   }
 
-  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
+  // If target-size is 128-bits (or 256-bits on AVX target), then convert to
   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
   // Also use this if we don't have SSE41 to allow the legalizer do its job.
   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
-      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is256BitVector() && Subtarget.hasAVX()) ||
       (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
     return Opcode == ISD::SIGN_EXTEND
@@ -38377,9 +38396,9 @@
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
   };
 
-  // On pre-AVX2 targets, split into 128-bit nodes of
+  // On pre-AVX targets, split into 128-bit nodes of
   // ISD::*_EXTEND_VECTOR_INREG.
-  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+  if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
     return SplitAndExtendInReg(128);
 
   // On pre-AVX512 targets, split into 256-bit nodes of
Index: llvm/trunk/test/CodeGen/X86/avg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll
+++ llvm/trunk/test/CodeGen/X86/avg.ll
@@ -275,21 +275,21 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
@@ -301,7 +301,7 @@
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm9
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
@@ -312,7 +312,7 @@
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm12
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm13, %xmm13
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
@@ -323,7 +323,7 @@
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm14, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
 ; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
Index: llvm/trunk/test/CodeGen/X86/pmaddubsw.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pmaddubsw.ll
+++ llvm/trunk/test/CodeGen/X86/pmaddubsw.ll
@@ -332,33 +332,31 @@
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
-; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
 ; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT:    vpmulld %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm3
+; AVX1-NEXT:    vpmulld %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: pmaddubsw_bad_extend:
@@ -463,29 +461,29 @@
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpmulld %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
 ; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: pmaddubsw_bad_indices:
Index: llvm/trunk/test/CodeGen/X86/psubus.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/psubus.ll
+++ llvm/trunk/test/CodeGen/X86/psubus.ll
@@ -768,7 +768,7 @@
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
 ; AVX1-NEXT:    vpmaxud %xmm0, %xmm6, %xmm7
@@ -1772,10 +1772,6 @@
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
Index: llvm/trunk/test/CodeGen/X86/vector-sext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll
@@ -345,7 +345,7 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
@@ -576,9 +576,9 @@
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
@@ -971,7 +971,7 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll
@@ -293,7 +293,7 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
@@ -445,9 +445,9 @@
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
@@ -747,7 +747,7 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
@@ -2225,12 +2225,12 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX1-NEXT:    vmovaps %ymm4, %ymm0
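
Reviewer note (not part of the patch): the core change is the new pre-AVX2 path in LowerEXTEND_VECTOR_INREG above, which extends the low half of the source directly, shuffles the source's high half down into the low lanes (the HiMask loop), extends that too, and concatenates the two 128-bit halves. Below is a minimal standalone C++ model of that data movement, with scalar arrays standing in for XMM/YMM registers; the zext_inreg_v4i32 helper and the v8i16 -> v8i32 element types are illustrative assumptions, not code from the patch.

#include <array>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for ZERO_EXTEND_VECTOR_INREG v8i16 -> v4i32:
// only the low four source elements are widened.
static std::array<uint32_t, 4> zext_inreg_v4i32(const std::array<uint16_t, 8> &In) {
  std::array<uint32_t, 4> Out{};
  for (int i = 0; i != 4; ++i)
    Out[i] = In[i]; // zero extension of the low lanes
  return Out;
}

int main() {
  std::array<uint16_t, 8> In{1, 2, 3, 4, 5, 6, 7, 8};

  // Lo half: extend the low lanes of the input directly.
  std::array<uint32_t, 4> Lo = zext_inreg_v4i32(In);

  // Hi half: first shuffle the high lanes down into the low lanes
  // (HiMask = {4,5,6,7,u,u,u,u} in the patch), then apply the same
  // 128-bit extension.
  std::array<uint16_t, 8> Shuffled{};
  for (int i = 0; i != 4; ++i)
    Shuffled[i] = In[4 + i];
  std::array<uint32_t, 4> Hi = zext_inreg_v4i32(Shuffled);

  // CONCAT_VECTORS: glue the two 128-bit halves into the 256-bit result.
  for (int i = 0; i != 4; ++i)
    std::printf("%u ", Lo[i]);
  for (int i = 0; i != 4; ++i)
    std::printf("%u ", Hi[i]);
  std::printf("\n"); // prints: 1 2 3 4 5 6 7 8
  return 0;
}

It prints the same lanes a single 256-bit zero extension would produce, which is why returning CONCAT_VECTORS(Lo, Hi) is a safe lowering, and why the test diffs above are pure instruction-selection churn (e.g. vpshufd masks switching from [3,1,2,3] to [3,3,0,1]) rather than behavioral changes.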