Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1124,12 +1124,12 @@
     setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
   }
 
-  if (HasInt256) {
-    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
-      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
-    }
+  for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+  }
 
+  if (HasInt256) {
     // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
     // when we have a 256bit-wide blend with immediate.
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -19713,18 +19713,20 @@
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
-      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is256BitVector() && Subtarget.hasAVX()) &&
       !(VT.is512BitVector() && Subtarget.hasAVX512()))
     return SDValue();
 
   SDLoc dl(Op);
+  unsigned Opc = Op.getOpcode();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   // For 512-bit vectors, we need 128-bits or 256-bits.
   if (VT.getSizeInBits() > 128) {
     // Input needs to be at least the same number of elements as output, and
     // at least 128-bits.
-    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    int InSize = InSVT.getSizeInBits() * NumElts;
     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
   }
 
@@ -19733,14 +19735,31 @@
   // need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
-    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
-                        X86ISD::VSEXT : X86ISD::VZEXT;
+    unsigned ExtOpc =
+        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
+  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
+  if (Subtarget.hasAVX()) {
+    assert(VT.is256BitVector() && "256-bit vector expected");
+    int HalfNumElts = NumElts / 2;
+    MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
+
+    InVT = In.getSimpleValueType();
+    unsigned NumSrcElts = InVT.getVectorNumElements();
+    SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
+    for (int i = 0; i != HalfNumElts; ++i)
+      HiMask[i] = HalfNumElts + i;
+
+    SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
+    SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
+    Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+  }
+
   // We should only get here for sign extend.
-  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
-         "Unexpected opcode!");
+  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
 
   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
   SDValue Curr = In;
@@ -38316,9 +38335,9 @@
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
 
-  // On AVX2+ targets, if the input/output types are both legal then we will be
+  // On AVX+ targets, if the input/output types are both legal then we will be
   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
- if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + if (Subtarget.hasAVX() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); @@ -38346,11 +38365,11 @@ DAG.getIntPtrConstant(0, DL)); } - // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to + // If target-size is 128-bits (or 256-bits on AVX target), then convert to // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND @@ -38377,9 +38396,9 @@ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); }; - // On pre-AVX2 targets, split into 128-bit nodes of + // On pre-AVX targets, split into 128-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) + if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) return SplitAndExtendInReg(128); // On pre-AVX512 targets, split into 256-bit nodes of Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -275,21 +275,21 @@ ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero @@ -301,7 +301,7 @@ ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] @@ -312,7 +312,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] @@ -323,7 +323,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] @@ -2139,248 +2139,263 @@ ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: subq $16, %rsp ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpextrq $1, %xmm5, %rbx -; AVX1-NEXT: vmovq %xmm5, %rbp -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %rsi -; AVX1-NEXT: vmovq %xmm4, %rcx -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %r8 -; AVX1-NEXT: vmovq %xmm4, %r11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: movzbl 3(%rdi), %r14d +; AVX1-NEXT: movzbl 2(%rdi), %r15d +; AVX1-NEXT: movzbl 1(%rdi), %r12d +; AVX1-NEXT: movzbl (%rdi), %edx +; AVX1-NEXT: movzbl 7(%rdi), %ebx +; AVX1-NEXT: movzbl 6(%rdi), %ecx +; AVX1-NEXT: movzbl 5(%rdi), %eax +; AVX1-NEXT: movzbl 4(%rdi), %edi +; 
AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: movzbl 3(%rsi), %ebp +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl 2(%rsi), %ebp +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl 1(%rsi), %ebp +; AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: movzbl (%rsi), %r9d +; AVX1-NEXT: movzbl 7(%rsi), %r13d +; AVX1-NEXT: movzbl 6(%rsi), %r10d +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: movzbl 5(%rsi), %ebp +; AVX1-NEXT: movzbl 4(%rsi), %edi +; AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $3, %ebx, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm4, %rbx +; AVX1-NEXT: vmovq %xmm4, %rsi ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, %r13 -; AVX1-NEXT: vmovq %xmm3, %r12 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vmovd %edx, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %r11 +; AVX1-NEXT: vpinsrd $1, %r12d, %xmm4, %xmm3 +; AVX1-NEXT: vpinsrd $2, %r15d, %xmm3, %xmm3 +; AVX1-NEXT: vpinsrd $3, %r14d, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm4, %r15 -; AVX1-NEXT: vmovq %xmm4, %rdi -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vmovq %xmm3, %r10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %rdx -; AVX1-NEXT: addq %rbx, %rdx -; AVX1-NEXT: vmovq %xmm4, %r9 -; AVX1-NEXT: addq %rbp, %r9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vmovd %edi, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %rdi +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpinsrd $1, %ebp, %xmm4, %xmm4 +; AVX1-NEXT: vpextrq $1, %xmm5, %r8 +; AVX1-NEXT: vpinsrd $2, %r10d, %xmm4, %xmm4 +; AVX1-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: addq %rsi, %rax -; AVX1-NEXT: movq %rax, %r14 -; AVX1-NEXT: vmovq %xmm3, %rbp -; AVX1-NEXT: addq %rcx, %rbp -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, %rsi -; AVX1-NEXT: addq %r8, %rsi -; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vpinsrd $3, %r13d, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm5, %r12 +; AVX1-NEXT: addq %rbx, %r12 +; AVX1-NEXT: vmovq %xmm5, %r10 +; AVX1-NEXT: addq %rsi, %r10 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpextrq $1, %xmm4, %rbx +; AVX1-NEXT: addq %rax, %rbx +; AVX1-NEXT: movq %rbx, %r13 +; AVX1-NEXT: vmovq %xmm4, %rax ; AVX1-NEXT: addq %r11, %rax ; AVX1-NEXT: movq %rax, %r11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: addq %r13, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: addq %r15, %rax -; AVX1-NEXT: movq %rax, %rbx -; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: addq %rdi, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vmovd %r9d, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %rbx +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 4-byte Folded Reload +; AVX1-NEXT: vpextrq $1, %xmm3, %rdx +; AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 4-byte Folded Reload +; AVX1-NEXT: vmovq %xmm3, %r14 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 4-byte Folded Reload +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm4, %rcx +; AVX1-NEXT: addq %r15, %rcx +; AVX1-NEXT: vmovq %xmm4, %rbp +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX1-NEXT: movq %rbp, %r15 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX1-NEXT: vmovq %xmm3, %r9 +; AVX1-NEXT: addq %rdi, %r9 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm4, %rdi +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: movq %rdi, %r8 +; AVX1-NEXT: vmovq %xmm4, %rdi +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-NEXT: vpextrq $1, %xmm3, %rdi +; AVX1-NEXT: addq %rax, %rdi +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm3, 
%rax +; AVX1-NEXT: addq %rbx, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %rbx +; AVX1-NEXT: addq %rdx, %rbx +; AVX1-NEXT: vmovq %xmm1, %rdi +; AVX1-NEXT: addq %r14, %rdi ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vmovq %xmm2, %r12 -; AVX1-NEXT: addq %r10, %r12 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpextrq $1, %xmm0, %r10 -; AVX1-NEXT: addq %rax, %r10 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: addq %rax, %rdi -; AVX1-NEXT: addq $-1, %rdx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r12 +; AVX1-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %r9 -; AVX1-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %r14 -; AVX1-NEXT: movq %r14, (%rsp) # 8-byte Spill -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %rbp -; AVX1-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r10 +; AVX1-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %rsi -; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq %rax, (%rsp) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r13 +; AVX1-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: addq $-1, %r11 ; AVX1-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: addq $-1, %rcx ; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r15 +; AVX1-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: addq $-1, %rsi +; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %ebp ; AVX1-NEXT: adcq $-1, %rbp +; AVX1-NEXT: addq $-1, %r9 +; 
AVX1-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r12d +; AVX1-NEXT: adcq $-1, %r12 ; AVX1-NEXT: addq $-1, %r8 ; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %r15d ; AVX1-NEXT: adcq $-1, %r15 -; AVX1-NEXT: addq $-1, %rbx -; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: movl $0, %r13d -; AVX1-NEXT: adcq $-1, %r13 ; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %r14d ; AVX1-NEXT: adcq $-1, %r14 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: addq $-1, %rdx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX1-NEXT: addq $-1, %r13 ; AVX1-NEXT: movl $0, %r11d ; AVX1-NEXT: adcq $-1, %r11 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: adcq $-1, %rbx -; AVX1-NEXT: addq $-1, %r12 +; AVX1-NEXT: addq $-1, %rbx +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: adcq $-1, %r10 +; AVX1-NEXT: addq $-1, %rdi ; AVX1-NEXT: movl $0, %r9d ; AVX1-NEXT: adcq $-1, %r9 -; AVX1-NEXT: addq $-1, %r10 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %r8d ; AVX1-NEXT: adcq $-1, %r8 -; AVX1-NEXT: addq $-1, %rdi -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: adcq $-1, %rcx -; AVX1-NEXT: shldq $63, %rdi, %rcx -; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: shldq $63, %r10, %r8 -; AVX1-NEXT: shldq $63, %r12, %r9 -; AVX1-NEXT: shldq $63, %rax, %rbx -; AVX1-NEXT: shldq $63, %rdx, %r11 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %esi +; AVX1-NEXT: adcq $-1, %rsi +; AVX1-NEXT: shldq $63, %rdi, %r9 +; AVX1-NEXT: shldq $63, %rbx, %r10 +; AVX1-NEXT: shldq $63, %r13, %r11 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %r14 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %r15 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %r14 +; AVX1-NEXT: shldq $63, %rdx, %r12 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %r13 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %rsi -; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %r15 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %rbp -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %rsi +; AVX1-NEXT: shldq $63, %rdx, %rbp +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdx, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte 
Reload +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rcx, %rdx ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %rcx ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %rdi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %rcx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %rax +; AVX1-NEXT: movq %rax, %r13 ; AVX1-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %r12 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %r10 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %rax -; AVX1-NEXT: vmovq %rax, %xmm8 -; AVX1-NEXT: vmovq %r10, %xmm0 -; AVX1-NEXT: vmovq %r12, %xmm1 -; AVX1-NEXT: vmovq %rdi, %xmm11 -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovq %rsi, %xmm13 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdi, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %rdi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %rsi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %r8 +; AVX1-NEXT: vmovq %rdi, %xmm8 +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vmovq %r13, %xmm1 +; AVX1-NEXT: vmovq %rcx, %xmm11 +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 8-byte Folded Reload +; AVX1-NEXT: # xmm13 = mem[0],zero ; AVX1-NEXT: vmovq %rbp, %xmm14 -; AVX1-NEXT: vmovq %r15, %xmm15 +; AVX1-NEXT: vmovq %r12, %xmm15 ; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload ; AVX1-NEXT: # xmm9 = mem[0],zero -; AVX1-NEXT: vmovq %r13, %xmm10 +; AVX1-NEXT: vmovq %r15, %xmm10 ; AVX1-NEXT: vmovq %r14, %xmm12 ; AVX1-NEXT: vmovq %r11, %xmm3 -; AVX1-NEXT: vmovq %rbx, %xmm4 +; AVX1-NEXT: vmovq %r10, %xmm4 ; AVX1-NEXT: vmovq %r9, %xmm5 ; AVX1-NEXT: vmovq %r8, %xmm6 -; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],zero +; AVX1-NEXT: vmovq %rsi, %xmm7 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,2],xmm8[0,2] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,2],xmm0[0,2] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] 
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm2[0,2] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm3[0,2] ; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: addq $16, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 ; AVX1-NEXT: popq %r13 Index: test/CodeGen/X86/cast-vsel.ll =================================================================== --- test/CodeGen/X86/cast-vsel.ll +++ test/CodeGen/X86/cast-vsel.ll @@ -93,15 +93,14 @@ ; AVX1-LABEL: zext: ; AVX1: # %bb.0: ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext: Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -2040,12 +2040,11 @@ ; ; AVX1-LABEL: pmaddwd_negative1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 Index: test/CodeGen/X86/pmaddubsw.ll =================================================================== --- test/CodeGen/X86/pmaddubsw.ll +++ test/CodeGen/X86/pmaddubsw.ll @@ 
-332,33 +332,31 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddubsw_bad_extend: @@ -463,29 +461,29 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: 
vpmulld %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddubsw_bad_indices: Index: test/CodeGen/X86/pr15267.ll =================================================================== --- test/CodeGen/X86/pr15267.ll +++ test/CodeGen/X86/pr15267.ll @@ -44,23 +44,22 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl (%rdi), %eax ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shlq $62, %rcx +; CHECK-NEXT: shlq $60, %rcx ; CHECK-NEXT: sarq $63, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: shlq $63, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovq %rcx, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shlq $61, %rcx ; CHECK-NEXT: sarq $63, %rcx -; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; CHECK-NEXT: shlq $60, %rax +; CHECK-NEXT: vmovq %rcx, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shlq $62, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: vmovq %rcx, %xmm1 +; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %wide.load35 = load <4 x i1>, <4 x i1>* %in, align 1 Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -549,25 +549,25 @@ ; ; 
AVX1-LABEL: test13: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -768,7 +768,7 @@ ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7 @@ -934,25 +934,25 @@ ; ; AVX1-LABEL: test15: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, 
%xmm4, %xmm4 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1070,25 +1070,25 @@ ; ; AVX1-LABEL: test16: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxud %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1772,10 +1772,6 @@ ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/shrink_vmul.ll =================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -2316,8 +2316,8 @@ ; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 ; X86-AVX1-NEXT: pushl %esi ; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 -; X86-AVX1-NEXT: subl $16, %esp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 36 +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 28 
; X86-AVX1-NEXT: .cfi_offset %esi, -20 ; X86-AVX1-NEXT: .cfi_offset %edi, -16 ; X86-AVX1-NEXT: .cfi_offset %ebx, -12 @@ -2326,8 +2326,8 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: vmovdqa (%eax), %ymm2 ; X86-AVX1-NEXT: vmovdqa (%ecx), %ymm1 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -2339,50 +2339,50 @@ ; X86-AVX1-NEXT: vpextrd $3, %xmm3, %eax ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %ecx ; X86-AVX1-NEXT: vpextrd $2, %xmm3, %eax ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: movl %edx, %edi ; X86-AVX1-NEXT: vpextrd $1, %xmm1, %ecx ; X86-AVX1-NEXT: vpextrd $1, %xmm3, %eax ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: movl %edx, %ebx ; X86-AVX1-NEXT: vmovd %xmm1, %ecx ; X86-AVX1-NEXT: vmovd %xmm3, %eax ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx ; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %ebx +; X86-AVX1-NEXT: movl %edx, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi ; X86-AVX1-NEXT: divl %esi ; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: vmovd %ebp, %xmm2 ; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax -; X86-AVX1-NEXT: divl %edi -; X86-AVX1-NEXT: movl %edx, %edi -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: vmovd %xmm1, %ecx +; X86-AVX1-NEXT: vpextrd $1, %xmm1, %ebp +; X86-AVX1-NEXT: divl %ebp +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vpinsrd $1, %ebx, %xmm2, %xmm2 ; X86-AVX1-NEXT: vmovd %xmm0, %eax -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: vmovd %edx, %xmm0 -; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovd %ebp, %xmm1 -; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload -; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, %edi, %xmm2, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm1, %edi +; X86-AVX1-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0 # 4-byte Folded Reload +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: vmovd %edx, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, %ebp, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpinsrd 
$2, %esi, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 ; X86-AVX1-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload ; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 @@ -2390,11 +2390,11 @@ ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] ; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vmovd %xmm1, (%eax) ; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) -; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: addl $8, %esp ; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 @@ -2589,8 +2589,8 @@ ; X64-AVX1-NEXT: .cfi_offset %rbp, -16 ; X64-AVX1-NEXT: vmovdqa (%rdi), %ymm2 ; X64-AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero @@ -2618,38 +2618,38 @@ ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %edi -; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %ecx -; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx ; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ebx ; X64-AVX1-NEXT: movl %edx, %ebx -; X64-AVX1-NEXT: vmovd %xmm1, %ebp ; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vmovd %xmm1, %ebp ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ebp -; X64-AVX1-NEXT: vmovd %edx, %xmm0 -; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovd %esi, %xmm2 -; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vmovd %edx, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovd %r8d, %xmm1 ; X64-AVX1-NEXT: movl $8199, 
%eax # imm = 0x2007 ; X64-AVX1-NEXT: vmovd %eax, %xmm2 Index: test/CodeGen/X86/v8i1-masks.ll =================================================================== --- test/CodeGen/X86/v8i1-masks.ll +++ test/CodeGen/X86/v8i1-masks.ll @@ -133,12 +133,8 @@ ; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X32-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; X32-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X32-NEXT: vpand LCPI2_0, %xmm0, %xmm0 -; X32-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-NEXT: vandps LCPI2_0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: and_mask_constant: @@ -147,12 +143,8 @@ ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq ; ; X32-AVX2-LABEL: and_mask_constant: Index: test/CodeGen/X86/vec_cast2.ll =================================================================== --- test/CodeGen/X86/vec_cast2.ll +++ test/CodeGen/X86/vec_cast2.ll @@ -87,10 +87,10 @@ ; CHECK-LABEL: cvt_v8u8_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpand LCPI4_0, %xmm0, %xmm0 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; @@ -109,19 +109,19 @@ define <8 x float> @cvt_v8u16_v8f32(<8 x i16> %src) { ; CHECK-LABEL: cvt_v8u16_v8f32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: cvt_v8u16_v8f32: ; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-WIDE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-WIDE-NEXT: vinsertf128 $1, 
%xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; CHECK-WIDE-NEXT: retl
 %res = uitofp <8 x i16> %src to <8 x float>
Index: test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- test/CodeGen/X86/vec_int_to_fp.ll
+++ test/CodeGen/X86/vec_int_to_fp.ll
@@ -2433,10 +2433,10 @@
 ;
 ; AVX1-LABEL: uitofp_8i16_to_4f32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX1-NEXT: vzeroupper
@@ -2972,10 +2972,10 @@
 ;
 ; AVX1-LABEL: uitofp_8i16_to_8f32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -5748,10 +5748,8 @@
 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: movq 24(%rdi), %rax
-; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
Index: test/CodeGen/X86/vector-sext.ll
===================================================================
--- test/CodeGen/X86/vector-sext.ll
+++ test/CodeGen/X86/vector-sext.ll
@@ -345,7 +345,7 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -576,9 +576,9 @@
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -971,7 +971,7 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -1588,23 +1588,22 @@
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: movzbl (%rdi), %eax
 ; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: shlq $60, %rcx
 ; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shlq $63, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm0
 ; AVX1-NEXT: movq %rax, %rcx
 ; AVX1-NEXT: shlq $61, %rcx
 ; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shlq $60, %rax
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: shlq $63, %rax
 ; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
 ;
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -64,10 +64,10 @@
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext_16i8_to_16i16:
@@ -293,7 +293,7 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -445,9 +445,9 @@
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -526,10 +526,10 @@
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext_8i16_to_8i32:
@@ -747,7 +747,7 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vmovaps %ymm2, %ymm0
@@ -825,10 +825,10 @@
 ;
 ; AVX1-LABEL: zext_4i32_to_4i64:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext_4i32_to_4i64:
@@ -1540,10 +1540,10 @@
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
@@ -2225,12 +2225,12 @@
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
 ; AVX1-NEXT: vmovaps %ymm4, %ymm0
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -41,7 +41,6 @@
 ; AVX1-LABEL: test2:
 ; AVX1: ## %bb.0: ## %bb
 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
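For reference, a minimal standalone sketch of the pattern the updated AVX1 checks exercise; it is illustrative only and not part of the patch, and the function name and run line are assumptions. It mirrors the existing zext_8i16_to_8i32 test: with the lowering change, a 256-bit zero-extend on an AVX1-only target is expected to split into a vpmovzxwd of the low half, a vpunpckhwd against a zeroed register for the high half, and a vinsertf128 that recombines the two 128-bit halves, as checked in the vector-zext.ll hunks above.

; Hypothetical reproducer (not taken from the patch); something like
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx < reproducer.ll
; should show the split lowering checked above.
define <8 x i32> @zext_8i16_to_8i32_sketch(<8 x i16> %a) {
entry:
  ; Zero-extend each i16 lane to i32, giving a 256-bit (ymm) result that
  ; pre-AVX2 targets must assemble from two 128-bit halves.
  %ext = zext <8 x i16> %a to <8 x i32>
  ret <8 x i32> %ext
}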