Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -31239,102 +31239,7 @@
   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
     return MinMax;
 
-  // Only operate on vectors of 4 elements, where the alternative shuffling
-  // gets to be more expensive.
-  if (SrcVT != MVT::v4i32)
-    return SDValue();
-
-  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
-  // single use which is a sign-extend or zero-extend, and all elements are
-  // used.
-  SmallVector<SDNode *, 4> Uses;
-  unsigned ExtractedElements = 0;
-  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
-       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
-    if (UI.getUse().getResNo() != InputVector.getResNo())
-      return SDValue();
-
-    SDNode *Extract = *UI;
-    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-      return SDValue();
-
-    if (Extract->getValueType(0) != MVT::i32)
-      return SDValue();
-    if (!Extract->hasOneUse())
-      return SDValue();
-    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
-        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
-      return SDValue();
-    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
-      return SDValue();
-
-    // Record which element was extracted.
-    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
-    Uses.push_back(Extract);
-  }
-
-  // If not all the elements were used, this may not be worthwhile.
-  if (ExtractedElements != 15)
-    return SDValue();
-
-  // Ok, we've now decided to do the transformation.
-  // If 64-bit shifts are legal, use the extract-shift sequence,
-  // otherwise bounce the vector off the cache.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue Vals[4];
-
-  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
-    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
-    auto &DL = DAG.getDataLayout();
-    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
-    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
-                                     DAG.getConstant(0, dl, VecIdxTy));
-    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
-                                  DAG.getConstant(1, dl, VecIdxTy));
-
-    SDValue ShAmt = DAG.getConstant(
-        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
-    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
-    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
-                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
-    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
-    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
-                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
-  } else {
-    // Store the value to a temporary stack slot.
-    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
-    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
-                              MachinePointerInfo());
-
-    EVT ElementType = SrcVT.getVectorElementType();
-    unsigned EltSize = ElementType.getSizeInBits() / 8;
-
-    // Replace each use (extract) with a load of the appropriate element.
-    for (unsigned i = 0; i < 4; ++i) {
-      uint64_t Offset = EltSize * i;
-      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
-      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
-
-      SDValue ScalarAddr =
-          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
-
-      // Load the scalar.
-      Vals[i] =
-          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
-    }
-  }
-
-  // Replace the extracts
-  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
-       UE = Uses.end(); UI != UE; ++UI) {
-    SDNode *Extract = *UI;
-
-    uint64_t IdxVal = Extract->getConstantOperandVal(1);
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
-  }
-
-  // The replacement was made in place; return N so it won't be revisited.
-  return SDValue(N, 0);
+  return SDValue();
 }
 
 /// If a vector select has an operand that is -1 or 0, try to simplify the
Index: llvm/trunk/test/CodeGen/X86/gather-addresses.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/gather-addresses.ll
+++ llvm/trunk/test/CodeGen/X86/gather-addresses.ll
@@ -7,21 +7,24 @@
 ; rdar://7398554
 
 ; When doing vector gather-scatter index calculation with 32-bit indices,
-; use an efficient mov/shift sequence rather than shuffling each individual
-; element out of the index vector.
+; minimize shuffling of each individual element out of the index vector.
 
 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
 ; LIN-SSE2-LABEL: foo:
 ; LIN-SSE2:       # %bb.0:
 ; LIN-SSE2-NEXT:    movdqa (%rsi), %xmm0
 ; LIN-SSE2-NEXT:    pand (%rdx), %xmm0
+; LIN-SSE2-NEXT:    movd %xmm0, %eax
+; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; LIN-SSE2-NEXT:    movd %xmm1, %ecx
 ; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; LIN-SSE2-NEXT:    movq %xmm1, %rax
-; LIN-SSE2-NEXT:    movq %xmm0, %rcx
-; LIN-SSE2-NEXT:    movslq %ecx, %rdx
-; LIN-SSE2-NEXT:    sarq $32, %rcx
-; LIN-SSE2-NEXT:    movslq %eax, %rsi
-; LIN-SSE2-NEXT:    sarq $32, %rax
+; LIN-SSE2-NEXT:    movd %xmm1, %edx
+; LIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; LIN-SSE2-NEXT:    movd %xmm0, %esi
+; LIN-SSE2-NEXT:    cltq
+; LIN-SSE2-NEXT:    movslq %ecx, %rcx
+; LIN-SSE2-NEXT:    movslq %edx, %rdx
+; LIN-SSE2-NEXT:    movslq %esi, %rsi
 ; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
@@ -32,14 +35,16 @@
 ; LIN-SSE4:       # %bb.0:
 ; LIN-SSE4-NEXT:    movdqa (%rsi), %xmm0
 ; LIN-SSE4-NEXT:    pand (%rdx), %xmm0
-; LIN-SSE4-NEXT:    pextrq $1, %xmm0, %rax
-; LIN-SSE4-NEXT:    movq %xmm0, %rcx
-; LIN-SSE4-NEXT:    movslq %ecx, %rdx
-; LIN-SSE4-NEXT:    sarq $32, %rcx
-; LIN-SSE4-NEXT:    movslq %eax, %rsi
+; LIN-SSE4-NEXT:    movd %xmm0, %eax
+; LIN-SSE4-NEXT:    pextrd $1, %xmm0, %ecx
+; LIN-SSE4-NEXT:    pextrd $2, %xmm0, %edx
+; LIN-SSE4-NEXT:    pextrd $3, %xmm0, %esi
+; LIN-SSE4-NEXT:    cltq
+; LIN-SSE4-NEXT:    movslq %ecx, %rcx
+; LIN-SSE4-NEXT:    movslq %edx, %rdx
 ; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; LIN-SSE4-NEXT:    sarq $32, %rax
+; LIN-SSE4-NEXT:    movslq %esi, %rax
 ; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; LIN-SSE4-NEXT:    retq
@@ -48,13 +53,17 @@
 ; WIN-SSE2:       # %bb.0:
 ; WIN-SSE2-NEXT:    movdqa (%rdx), %xmm0
 ; WIN-SSE2-NEXT:    pand (%r8), %xmm0
+; WIN-SSE2-NEXT:    movd %xmm0, %r8d
+; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; WIN-SSE2-NEXT:    movd %xmm1, %r9d
 ; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; WIN-SSE2-NEXT:    movq %xmm1, %rax
-; WIN-SSE2-NEXT:    movq %xmm0, %rdx
-; WIN-SSE2-NEXT:    movslq %edx, %r8
-; WIN-SSE2-NEXT:    sarq $32, %rdx
-; WIN-SSE2-NEXT:    movslq %eax, %r9
-; WIN-SSE2-NEXT:    sarq $32, %rax
+; WIN-SSE2-NEXT:    movd %xmm1, %r10d
+; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; WIN-SSE2-NEXT:    movd %xmm0, %edx
+; WIN-SSE2-NEXT: movslq %r8d, %rax +; WIN-SSE2-NEXT: movslq %r9d, %r8 +; WIN-SSE2-NEXT: movslq %r10d, %r9 +; WIN-SSE2-NEXT: movslq %edx, %rdx ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero @@ -65,14 +74,16 @@ ; WIN-SSE4: # %bb.0: ; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0 ; WIN-SSE4-NEXT: pand (%r8), %xmm0 -; WIN-SSE4-NEXT: pextrq $1, %xmm0, %rax -; WIN-SSE4-NEXT: movq %xmm0, %rdx -; WIN-SSE4-NEXT: movslq %edx, %r8 -; WIN-SSE4-NEXT: sarq $32, %rdx -; WIN-SSE4-NEXT: movslq %eax, %r9 +; WIN-SSE4-NEXT: movd %xmm0, %eax +; WIN-SSE4-NEXT: pextrd $1, %xmm0, %edx +; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d +; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d +; WIN-SSE4-NEXT: cltq +; WIN-SSE4-NEXT: movslq %edx, %rdx +; WIN-SSE4-NEXT: movslq %r8d, %r8 ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; WIN-SSE4-NEXT: sarq $32, %rax +; WIN-SSE4-NEXT: movslq %r9d, %rax ; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; WIN-SSE4-NEXT: retq @@ -127,22 +138,22 @@ ; LIN-SSE2: # %bb.0: ; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0 ; LIN-SSE2-NEXT: pand (%rdx), %xmm0 +; LIN-SSE2-NEXT: movd %xmm0, %eax +; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; LIN-SSE2-NEXT: movd %xmm1, %edx ; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; LIN-SSE2-NEXT: movq %xmm1, %rax -; LIN-SSE2-NEXT: movq %rax, %rdx -; LIN-SSE2-NEXT: shrq $32, %rdx -; LIN-SSE2-NEXT: movq %xmm0, %rsi -; LIN-SSE2-NEXT: movq %rsi, %rdi -; LIN-SSE2-NEXT: shrq $32, %rdi -; LIN-SSE2-NEXT: andl %ecx, %esi -; LIN-SSE2-NEXT: andl %ecx, %eax -; LIN-SSE2-NEXT: andq %rcx, %rdi +; LIN-SSE2-NEXT: movd %xmm1, %esi +; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; LIN-SSE2-NEXT: movd %xmm0, %edi +; LIN-SSE2-NEXT: andq %rcx, %rax ; LIN-SSE2-NEXT: andq %rcx, %rdx -; LIN-SSE2-NEXT: movq %rdi, %xmm1 -; LIN-SSE2-NEXT: movq %rsi, %xmm0 +; LIN-SSE2-NEXT: andq %rcx, %rsi +; LIN-SSE2-NEXT: andq %rcx, %rdi +; LIN-SSE2-NEXT: movq %rax, %xmm0 +; LIN-SSE2-NEXT: movq %rdx, %xmm1 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; LIN-SSE2-NEXT: movq %rdx, %xmm2 -; LIN-SSE2-NEXT: movq %rax, %xmm1 +; LIN-SSE2-NEXT: movq %rdi, %xmm2 +; LIN-SSE2-NEXT: movq %rsi, %xmm1 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN-SSE2-NEXT: retq ; @@ -150,21 +161,19 @@ ; LIN-SSE4: # %bb.0: ; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0 ; LIN-SSE4-NEXT: pand (%rdx), %xmm0 -; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax -; LIN-SSE4-NEXT: movq %rax, %rdx -; LIN-SSE4-NEXT: shrq $32, %rdx -; LIN-SSE4-NEXT: movq %xmm0, %rsi -; LIN-SSE4-NEXT: movq %rsi, %rdi -; LIN-SSE4-NEXT: shrq $32, %rdi -; LIN-SSE4-NEXT: andl %ecx, %esi -; LIN-SSE4-NEXT: andl %ecx, %eax -; LIN-SSE4-NEXT: andq %rcx, %rdi +; LIN-SSE4-NEXT: movd %xmm0, %eax +; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx +; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi +; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi +; LIN-SSE4-NEXT: andq %rcx, %rax ; LIN-SSE4-NEXT: andq %rcx, %rdx -; LIN-SSE4-NEXT: movq %rdi, %xmm1 -; LIN-SSE4-NEXT: movq %rsi, %xmm0 +; LIN-SSE4-NEXT: andq %rcx, %rsi +; LIN-SSE4-NEXT: andq %rcx, %rdi +; LIN-SSE4-NEXT: movq %rdx, %xmm1 +; LIN-SSE4-NEXT: movq %rax, %xmm0 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; LIN-SSE4-NEXT: movq %rdx, %xmm2 -; LIN-SSE4-NEXT: movq %rax, %xmm1 +; LIN-SSE4-NEXT: movq %rdi, %xmm2 +; LIN-SSE4-NEXT: movq %rsi, %xmm1 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; LIN-SSE4-NEXT: retq ; @@ -172,21 +181,21 @@ ; WIN-SSE2: # %bb.0: ; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0 ; WIN-SSE2-NEXT: pand (%r8), %xmm0 +; WIN-SSE2-NEXT: movd %xmm0, %eax +; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; WIN-SSE2-NEXT: movd %xmm1, %ecx ; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; WIN-SSE2-NEXT: movq %xmm1, %r8 -; WIN-SSE2-NEXT: movq %r8, %rcx -; WIN-SSE2-NEXT: shrq $32, %rcx -; WIN-SSE2-NEXT: movq %xmm0, %rax -; WIN-SSE2-NEXT: movq %rax, %rdx -; WIN-SSE2-NEXT: shrq $32, %rdx -; WIN-SSE2-NEXT: andl %r9d, %eax -; WIN-SSE2-NEXT: andl %r9d, %r8d -; WIN-SSE2-NEXT: andq %r9, %rdx +; WIN-SSE2-NEXT: movd %xmm1, %r8d +; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; WIN-SSE2-NEXT: movd %xmm0, %edx +; WIN-SSE2-NEXT: andq %r9, %rax ; WIN-SSE2-NEXT: andq %r9, %rcx -; WIN-SSE2-NEXT: movq %rdx, %xmm1 +; WIN-SSE2-NEXT: andq %r9, %r8 +; WIN-SSE2-NEXT: andq %r9, %rdx ; WIN-SSE2-NEXT: movq %rax, %xmm0 +; WIN-SSE2-NEXT: movq %rcx, %xmm1 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE2-NEXT: movq %rcx, %xmm2 +; WIN-SSE2-NEXT: movq %rdx, %xmm2 ; WIN-SSE2-NEXT: movq %r8, %xmm1 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE2-NEXT: retq @@ -195,53 +204,47 @@ ; WIN-SSE4: # %bb.0: ; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0 ; WIN-SSE4-NEXT: pand (%r8), %xmm0 -; WIN-SSE4-NEXT: pextrq $1, %xmm0, %r8 -; WIN-SSE4-NEXT: movq %r8, %rcx -; WIN-SSE4-NEXT: shrq $32, %rcx -; WIN-SSE4-NEXT: movq %xmm0, %rax -; WIN-SSE4-NEXT: movq %rax, %rdx -; WIN-SSE4-NEXT: shrq $32, %rdx -; WIN-SSE4-NEXT: andl %r9d, %eax -; WIN-SSE4-NEXT: andl %r9d, %r8d -; WIN-SSE4-NEXT: andq %r9, %rdx +; WIN-SSE4-NEXT: movd %xmm0, %eax +; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx +; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d +; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx +; WIN-SSE4-NEXT: andq %r9, %rax ; WIN-SSE4-NEXT: andq %r9, %rcx -; WIN-SSE4-NEXT: movq %rdx, %xmm1 +; WIN-SSE4-NEXT: andq %r9, %r8 +; WIN-SSE4-NEXT: andq %r9, %rdx +; WIN-SSE4-NEXT: movq %rcx, %xmm1 ; WIN-SSE4-NEXT: movq %rax, %xmm0 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE4-NEXT: movq %rcx, %xmm2 +; WIN-SSE4-NEXT: movq %rdx, %xmm2 ; WIN-SSE4-NEXT: movq %r8, %xmm1 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE4-NEXT: retq ; ; LIN32-LABEL: old: ; LIN32: # %bb.0: -; LIN32-NEXT: pushl %ebp -; LIN32-NEXT: movl %esp, %ebp +; LIN32-NEXT: pushl %edi ; LIN32-NEXT: pushl %esi -; LIN32-NEXT: andl $-16, %esp -; LIN32-NEXT: subl $32, %esp -; LIN32-NEXT: movl 20(%ebp), %eax -; LIN32-NEXT: movl 16(%ebp), %ecx -; LIN32-NEXT: movl 12(%ebp), %edx -; LIN32-NEXT: movaps (%edx), %xmm0 -; LIN32-NEXT: andps (%ecx), %xmm0 -; LIN32-NEXT: movaps %xmm0, (%esp) -; LIN32-NEXT: movl (%esp), %ecx -; LIN32-NEXT: andl %eax, %ecx +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; LIN32-NEXT: movdqa (%edx), %xmm0 +; LIN32-NEXT: pand (%ecx), %xmm0 +; LIN32-NEXT: movd %xmm0, %ecx +; LIN32-NEXT: pextrd $1, %xmm0, %edx +; LIN32-NEXT: pextrd $2, %xmm0, %esi +; LIN32-NEXT: pextrd $3, %xmm0, %edi +; LIN32-NEXT: andl %eax, %ecx ; LIN32-NEXT: andl %eax, %edx -; LIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; LIN32-NEXT: andl %eax, %esi -; LIN32-NEXT: andl {{[0-9]+}}(%esp), %eax +; LIN32-NEXT: andl %eax, %edi ; LIN32-NEXT: movd %edx, %xmm1 ; LIN32-NEXT: movd %ecx, %xmm0 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; LIN32-NEXT: movd %eax, %xmm2 +; LIN32-NEXT: movd %edi, %xmm2 ; LIN32-NEXT: movd %esi, %xmm1 
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; LIN32-NEXT: leal -4(%ebp), %esp ; LIN32-NEXT: popl %esi -; LIN32-NEXT: popl %ebp +; LIN32-NEXT: popl %edi ; LIN32-NEXT: retl %a = load <4 x i32>, <4 x i32>* %i %b = load <4 x i32>, <4 x i32>* %h Index: llvm/trunk/test/CodeGen/X86/mulvi32.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/mulvi32.ll +++ llvm/trunk/test/CodeGen/X86/mulvi32.ll @@ -153,109 +153,51 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: shrq $32, %rcx -; SSE2-NEXT: movq %xmm0, %rdx -; SSE2-NEXT: movd %edx, %xmm2 -; SSE2-NEXT: shrq $32, %rdx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rsi -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: shrq $32, %rsi -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: shrq $32, %rax -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: movq %xmm1, %rax -; SSE42-NEXT: pextrq $1, %xmm1, %rcx -; SSE42-NEXT: movd %ecx, %xmm1 -; SSE42-NEXT: shrq $32, %rcx -; SSE42-NEXT: movq %xmm0, %rdx -; SSE42-NEXT: movd %edx, %xmm2 -; SSE42-NEXT: shrq $32, %rdx -; SSE42-NEXT: pextrq $1, %xmm0, %rsi -; SSE42-NEXT: movd %esi, %xmm3 -; SSE42-NEXT: shrq $32, %rsi -; SSE42-NEXT: movd %esi, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE42-NEXT: movd %edx, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE42-NEXT: movd %ecx, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE42-NEXT: movd %eax, %xmm0 -; SSE42-NEXT: shrq $32, %rax -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movd %eax, %xmm3 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE42-NEXT: pmuludq %xmm2, %xmm0 +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE42-NEXT: pmuludq %xmm0, %xmm1 +; SSE42-NEXT: pmuludq %xmm4, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: shrq 
$32, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rsi -; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: shrq $32, %rsi -; AVX1-NEXT: vmovd %esi, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX1-NEXT: vmovd %edx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: _mul4xi32toi64a: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: shrq $32, %rdx -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: shrq $32, %rsi -; AVX2-NEXT: vmovd %esi, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX2-NEXT: vmovd %edx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %f00 = extractelement <4 x i32> %0, i32 0 Index: llvm/trunk/test/CodeGen/X86/pr18344.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pr18344.ll +++ llvm/trunk/test/CodeGen/X86/pr18344.ll @@ -36,12 +36,14 @@ ; X64: # %bb.0: # %begin ; X64-NEXT: movdqu (%rdx), %xmm0 ; X64-NEXT: pslld $4, %xmm0 -; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: movslq %eax, %r8 -; X64-NEXT: sarq $32, %rax -; X64-NEXT: pextrq $1, %xmm0, %rdx -; X64-NEXT: movslq %edx, %rcx -; X64-NEXT: sarq $32, %rdx +; X64-NEXT: pextrd $1, %xmm0, %ecx +; X64-NEXT: movslq %ecx, %rcx +; X64-NEXT: pextrd $2, %xmm0, %edx +; X64-NEXT: movslq %edx, %rdx +; X64-NEXT: pextrd $3, %xmm0, %eax +; X64-NEXT: cltq ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero Index: llvm/trunk/test/CodeGen/X86/pr21792.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/pr21792.ll +++ 
llvm/trunk/test/CodeGen/X86/pr21792.ll @@ -12,19 +12,16 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pextrq $1, %xmm0, %rax -; CHECK-NEXT: movzwl %ax, %ecx -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movq %xmm0, %rdx -; CHECK-NEXT: movzwl %dx, %r8d -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: shrq $32, %r9 +; CHECK-NEXT: movd %xmm0, %r8d ; CHECK-NEXT: leaq stuff(%r8), %rdi -; CHECK-NEXT: leaq stuff(%r9), %rsi -; CHECK-NEXT: leaq stuff(%rcx), %rdx -; CHECK-NEXT: leaq stuff(%rax), %rcx +; CHECK-NEXT: pextrd $1, %xmm0, %eax +; CHECK-NEXT: leaq stuff(%rax), %rsi +; CHECK-NEXT: pextrd $2, %xmm0, %edx +; CHECK-NEXT: pextrd $3, %xmm0, %ecx +; CHECK-NEXT: leaq stuff(%rdx), %rdx +; CHECK-NEXT: leaq stuff(%rcx), %rcx ; CHECK-NEXT: leaq stuff+8(%r8), %r8 -; CHECK-NEXT: leaq stuff+8(%r9), %r9 +; CHECK-NEXT: leaq stuff+8(%rax), %r9 ; CHECK-NEXT: callq toto ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/var-permute-128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/var-permute-128.ll +++ llvm/trunk/test/CodeGen/X86/var-permute-128.ll @@ -37,44 +37,42 @@ define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind { ; SSSE3-LABEL: var_shuffle_v4i32: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: sarq $32, %rcx -; SSSE3-NEXT: movq %xmm1, %rdx -; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: sarq $32, %rsi -; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: andl $3, %eax ; SSSE3-NEXT: andl $3, %ecx +; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; AVX-LABEL: var_shuffle_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm1, %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: sarq $32, %rcx -; AVX-NEXT: vmovq %xmm1, %rdx -; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: sarq $32, %rsi -; AVX-NEXT: andl $3, %edx +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vpextrd $1, %xmm1, %ecx +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $3, %xmm1, %esi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: andl $3, %ecx +; AVX-NEXT: andl $3, %edx +; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; 
AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: retq %index0 = extractelement <4 x i32> %indices, i32 0 %index1 = extractelement <4 x i32> %indices, i32 1 @@ -287,40 +285,38 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind { ; SSSE3-LABEL: var_shuffle_v4f32: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %ecx ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSSE3-NEXT: movq %xmm2, %rax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: sarq $32, %rcx -; SSSE3-NEXT: movq %xmm1, %rdx -; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: sarq $32, %rsi -; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: movd %xmm2, %edx +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: andl $3, %eax ; SSSE3-NEXT: andl $3, %ecx +; SSSE3-NEXT: andl $3, %edx +; SSSE3-NEXT: andl $3, %esi ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; AVX-LABEL: var_shuffle_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm1, %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: sarq $32, %rcx -; AVX-NEXT: vmovq %xmm1, %rdx -; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: sarq $32, %rsi -; AVX-NEXT: andl $3, %edx +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vpextrd $1, %xmm1, %ecx +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vpextrd $3, %xmm1, %esi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %eax ; AVX-NEXT: andl $3, %ecx +; AVX-NEXT: andl $3, %edx +; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] Index: llvm/trunk/test/CodeGen/X86/var-permute-256.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/var-permute-256.ll +++ llvm/trunk/test/CodeGen/X86/var-permute-256.ll @@ -119,36 +119,32 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 -; AVX1-NEXT: movq %r8, %rcx -; AVX1-NEXT: shrq $30, %rcx -; AVX1-NEXT: vmovq %xmm1, %r9 -; AVX1-NEXT: movq %r9, %rsi -; AVX1-NEXT: shrq $30, %rsi +; AVX1-NEXT: vmovd %xmm1, %r8d +; AVX1-NEXT: vpextrd $1, %xmm1, %r9d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: vpextrd $3, %xmm1, %esi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: movq %r10, %rdi -; AVX1-NEXT: shrq $30, %rdi -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shrq $30, %rdx +; AVX1-NEXT: vmovd %xmm1, %edi +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: vpextrd $3, %xmm1, %edx ; AVX1-NEXT: vmovaps 
%ymm0, (%rsp) -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: andl $28, %esi ; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: andl $28, %ecx -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: andl $28, %edx +; AVX1-NEXT: andl $7, %r9d ; AVX1-NEXT: andl $7, %r10d -; AVX1-NEXT: andl $28, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %eax +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %edx ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp @@ -1212,28 +1208,24 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 -; AVX1-NEXT: movq %r8, %rcx -; AVX1-NEXT: shrq $30, %rcx -; AVX1-NEXT: vmovq %xmm1, %r9 -; AVX1-NEXT: movq %r9, %rdx -; AVX1-NEXT: shrq $30, %rdx +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: vpextrd $1, %xmm1, %r8d +; AVX1-NEXT: vpextrd $2, %xmm1, %r9d +; AVX1-NEXT: vpextrd $3, %xmm1, %r10d ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %r10 -; AVX1-NEXT: movq %r10, %rdi -; AVX1-NEXT: shrq $30, %rdi -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: shrq $30, %rsi +; AVX1-NEXT: vmovd %xmm1, %edx +; AVX1-NEXT: vpextrd $1, %xmm1, %edi +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx ; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: andl $28, %edx +; AVX1-NEXT: andl $7, %esi ; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: andl $28, %ecx -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: andl $28, %esi +; AVX1-NEXT: andl $7, %r9d ; AVX1-NEXT: andl $7, %r10d -; AVX1-NEXT: andl $28, %edi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %eax +; AVX1-NEXT: andl $7, %ecx ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -1375,36 +1367,32 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind { ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 -; AVX1-NEXT: movq %r8, %r10 -; AVX1-NEXT: shrq $30, %r10 -; AVX1-NEXT: vmovq %xmm1, %r9 -; AVX1-NEXT: movq %r9, %rsi -; AVX1-NEXT: shrq $30, %rsi +; AVX1-NEXT: vmovd %xmm1, %r8d ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andl $3, %r9d -; AVX1-NEXT: andl $12, %esi ; AVX1-NEXT: andl $3, %r8d -; AVX1-NEXT: andl $12, %r10d +; AVX1-NEXT: vpextrd $1, %xmm1, %r9d +; AVX1-NEXT: andl $3, %r9d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: andl $3, %r10d +; AVX1-NEXT: vpextrd $3, %xmm1, %esi +; AVX1-NEXT: andl $3, %esi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; 
AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: shrq $30, %rdi -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rdx -; AVX1-NEXT: shrq $30, %rdx -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: andl $12, %edx +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: andl $3, %eax -; AVX1-NEXT: andl $12, %edi +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vpextrd $3, %xmm0, %edx +; AVX1-NEXT: andl $3, %edx ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -2402,28 +2390,24 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind { ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 -; AVX1-NEXT: movq %r8, %r10 -; AVX1-NEXT: shrq $30, %r10 -; AVX1-NEXT: vmovq %xmm1, %r9 -; AVX1-NEXT: movq %r9, %rdx -; AVX1-NEXT: shrq $30, %rdx +; AVX1-NEXT: vmovd %xmm1, %r8d ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andl $3, %r9d -; AVX1-NEXT: andl $12, %edx ; AVX1-NEXT: andl $3, %r8d -; AVX1-NEXT: andl $12, %r10d +; AVX1-NEXT: vpextrd $1, %xmm1, %r9d +; AVX1-NEXT: andl $3, %r9d +; AVX1-NEXT: vpextrd $2, %xmm1, %r10d +; AVX1-NEXT: andl $3, %r10d +; AVX1-NEXT: vpextrd $3, %xmm1, %esi +; AVX1-NEXT: andl $3, %esi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: shrq $30, %rdi -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rsi -; AVX1-NEXT: shrq $30, %rsi -; AVX1-NEXT: andl $3, %ecx -; AVX1-NEXT: andl $12, %esi +; AVX1-NEXT: vmovd %xmm0, %edi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: andl $3, %eax -; AVX1-NEXT: andl $12, %edi +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vpextrd $3, %xmm0, %edx +; AVX1-NEXT: andl $3, %edx ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -2475,19 +2459,17 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovq %xmm1, %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $30, %rcx -; AVX-NEXT: andl $28, %ecx -; AVX-NEXT: vpextrq $1, %xmm1, %rdx -; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: sarq $32, %rsi +; AVX-NEXT: vmovd %xmm1, %eax +; AVX-NEXT: vmovaps %ymm0, (%rsp) ; AVX-NEXT: andl $7, %eax +; AVX-NEXT: vpextrd $1, %xmm1, %ecx +; AVX-NEXT: andl $7, %ecx +; AVX-NEXT: vpextrd $2, %xmm1, %edx ; AVX-NEXT: andl $7, %edx -; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: vpextrd $3, %xmm1, %esi ; AVX-NEXT: andl $7, %esi ; AVX-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $1, (%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: movq %rbp, %rsp