diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34082,6 +34082,14 @@
   return DAG.getBitcast(VT, CstOp);
 }
 
+namespace llvm {
+  namespace X86 {
+    enum {
+      MaxShuffleCombineDepth = 8
+    };
+  }
+} // namespace llvm
+
 /// Fully generic combining of x86 shuffle instructions.
 ///
 /// This should be the last combine run over the x86 shuffle instructions. Once
@@ -34114,16 +34122,15 @@
 static SDValue combineX86ShufflesRecursively(
     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
-    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
-    const X86Subtarget &Subtarget) {
+    unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
+    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
   assert(RootMask.size() > 0 &&
          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
          "Illegal shuffle root mask");
 
   // Bound the depth of our recursive combine because this is ultimately
   // quadratic in nature.
-  const unsigned MaxRecursionDepth = 8;
-  if (Depth >= MaxRecursionDepth)
+  if (Depth >= MaxDepth)
     return SDValue();
 
   // Directly rip through bitcasts to find the underlying operand.
@@ -34312,7 +34319,7 @@
   // shuffles to avoid constant pool bloat.
   // Don't recurse if we already have more source ops than we can combine in
   // the remaining recursion depth.
-  if (Ops.size() < (MaxRecursionDepth - Depth)) {
+  if (Ops.size() < (MaxDepth - Depth)) {
     for (int i = 0, e = Ops.size(); i < e; ++i) {
       // For empty roots, we need to resolve zeroable elements before combining
       // them with other shuffles.
@@ -34324,7 +34331,7 @@
           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
         AllowVar = AllowVariableMask;
       if (SDValue Res = combineX86ShufflesRecursively(
-              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
              HasVariableMask, AllowVar, DAG, Subtarget))
        return Res;
    }
@@ -34370,6 +34377,7 @@
 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
   return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
+                                       X86::MaxShuffleCombineDepth,
                                        /*HasVarMask*/ false,
                                        /*AllowVarMask*/ true, DAG, Subtarget);
 }
@@ -34605,6 +34613,7 @@
       DemandedMask[i] = i;
     if (SDValue Res = combineX86ShufflesRecursively(
             {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
+            X86::MaxShuffleCombineDepth,
             /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                          DAG.getBitcast(SrcVT, Res));
@@ -35839,16 +35848,22 @@
 
   // If we don't demand all elements, then attempt to combine to a simpler
   // shuffle.
-  // TODO: Handle other depths, but first we need to handle the fact that
-  // it might combine to the same shuffle.
-  if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+  // We need to convert the depth to something combineX86ShufflesRecursively
+  // can handle - so pretend it's Depth == 0 again, and reduce the max depth
+  // to match. This prevents combineX86ShuffleChain from returning a
+  // combined shuffle that's the same as the original root, causing an
+  // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) { + assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); + SmallVector DemandedMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) DemandedMask[i] = i; SDValue NewShuffle = combineX86ShufflesRecursively( - {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false, + {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, + /*HasVarMask*/ false, /*AllowVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) return TLO.CombineTo(Op, NewShuffle); @@ -40439,6 +40454,7 @@ if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1909,169 +1909,150 @@ ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: addq %r11, %rbp -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: addq %r10, %r14 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: addq %r9, %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: addq %r8, %r11 +; SSE2-NEXT: addq %rbp, %rbx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: addq %rdx, %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: addq %rcx, %r8 +; SSE2-NEXT: addq %rdi, %r10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: addq %rcx, %rbp ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq %r8, %rcx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %r12, %rax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r15,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r12,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), 
%esi -; SSE2-NEXT: leaq -1(%r13,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: leaq -1(%r14,%rdx), %r12 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: leaq -1(%r15,%rdx), %r15 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: leaq -1(%r13,%rdx), %r13 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: leaq -1(%rsi,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: leaq -1(%rsi,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: leaq -1(%r11,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: leaq -1(%r9,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: leaq -1(%rsi,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: leaq -1(%rsi,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; SSE2-NEXT: leaq -1(%rsi,%rdx), %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: xorl %r14d, %r14d +; SSE2-NEXT: addq $-1, %rbx ; SSE2-NEXT: movl $0, %r9d ; SSE2-NEXT: adcq $-1, %r9 -; SSE2-NEXT: addq $-1, %r14 +; SSE2-NEXT: addq $-1, %r10 ; SSE2-NEXT: movl $0, %esi ; SSE2-NEXT: adcq $-1, %rsi -; SSE2-NEXT: addq $-1, %rbx -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: adcq $-1, %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: addq $-1, %r11 -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: adcq $-1, %r12 -; SSE2-NEXT: addq $-1, %r10 -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: adcq $-1, %r13 -; SSE2-NEXT: addq $-1, %r8 -; SSE2-NEXT: movl $0, %r15d -; SSE2-NEXT: adcq $-1, %r15 +; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: movl $0, %r8d +; SSE2-NEXT: adcq $-1, %r8 ; SSE2-NEXT: addq $-1, %rdi -; SSE2-NEXT: movl $0, %ecx -; SSE2-NEXT: adcq $-1, %rcx -; SSE2-NEXT: addq 
$-1, %rdx -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: adcq $-1, %rax -; SSE2-NEXT: shldq $63, %rdx, %rax -; SSE2-NEXT: shldq $63, %rdi, %rcx -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shldq $63, %r8, %r15 -; SSE2-NEXT: shldq $63, %r10, %r13 -; SSE2-NEXT: shldq $63, %r11, %r12 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: shldq $63, %rbx, %rdi -; SSE2-NEXT: shldq $63, %r14, %rsi -; SSE2-NEXT: shldq $63, %rbp, %r9 +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: adcq $-1, %rdx +; SSE2-NEXT: addq $-1, %rcx +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: adcq $-1, %r11 +; SSE2-NEXT: addq $-1, %rax +; SSE2-NEXT: adcq $-1, %r14 +; SSE2-NEXT: shldq $63, %rax, %r14 +; SSE2-NEXT: shldq $63, %rcx, %r11 +; SSE2-NEXT: shldq $63, %rdi, %rdx +; SSE2-NEXT: shldq $63, %rbp, %r8 +; SSE2-NEXT: shldq $63, %r10, %rsi +; SSE2-NEXT: shldq $63, %rbx, %r9 ; SSE2-NEXT: movq %r9, %xmm8 -; SSE2-NEXT: movq %rsi, %xmm15 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm9 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm2 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm10 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm4 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm11 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm7 -; SSE2-NEXT: movq %rdi, %xmm12 -; SSE2-NEXT: movq %r12, %xmm0 -; SSE2-NEXT: movq %r13, %xmm13 -; SSE2-NEXT: movq %r15, %xmm6 -; SSE2-NEXT: movq %rdx, %xmm14 -; SSE2-NEXT: movq %rax, %xmm5 +; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: movq %r8, %xmm9 +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: shrq %r12 +; SSE2-NEXT: movq %r12, %xmm10 +; SSE2-NEXT: shrq %r15 +; SSE2-NEXT: movq %r15, %xmm5 +; SSE2-NEXT: shrq %r13 +; SSE2-NEXT: movq %r13, %xmm11 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm7 +; SSE2-NEXT: movq %r11, %xmm12 +; SSE2-NEXT: movq %r14, %xmm3 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm13 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm3 +; SSE2-NEXT: movq %rax, %xmm6 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm14 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm4 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm15 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax ; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,1,2,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm8 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; SSE2-NEXT: por %xmm8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: psllq $48, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,2,0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; SSE2-NEXT: movups %xmm2, (%rax) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE2-NEXT: movupd %xmm0, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -2482,12 +2463,12 @@ ; AVX2-NEXT: # xmm7 = mem[0],zero ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] +; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1800,16 +1800,16 @@ ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X86-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ 
-1817,11 +1817,11 @@ ; ; X64-LABEL: test_mm256_set_ps: ; X64: # %bb.0: -; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X64-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] +; X64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq @@ -2418,26 +2418,26 @@ ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X86-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X86-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_setr_ps: ; X64: # %bb.0: -; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; X64-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] -; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-trunc.ll b/llvm/test/CodeGen/X86/avx-trunc.ll --- a/llvm/test/CodeGen/X86/avx-trunc.ll +++ b/llvm/test/CodeGen/X86/avx-trunc.ll @@ -16,9 +16,8 @@ ; CHECK-LABEL: trunc_32_16: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -156,28 +156,45 @@ } define <4 x float> 
@slto4f32_mem(<4 x i64>* %a) { -; NODQ-LABEL: slto4f32_mem: -; NODQ: # %bb.0: -; NODQ-NEXT: vmovdqu (%rdi), %xmm0 -; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; NODQ-NEXT: retq +; NOVLDQ-LABEL: slto4f32_mem: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vmovdqu (%rdi), %xmm0 +; NOVLDQ-NEXT: vmovdqu 16(%rdi), %xmm1 +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32_mem: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: slto4f32_mem: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vmovdqu (%rdi), %xmm0 +; VLNODQ-NEXT: vmovdqu 16(%rdi), %xmm1 +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: slto4f32_mem: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vmovups (%rdi), %ymm0 @@ -259,22 +276,22 @@ } define <4 x float> @slto4f32(<4 x i64> %a) { -; NODQ-LABEL: slto4f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: vzeroupper -; NODQ-NEXT: retq +; NOVLDQ-LABEL: slto4f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; 
NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32: ; VLDQ: # %bb.0: @@ -282,6 +299,23 @@ ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: slto4f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: slto4f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 @@ -294,22 +328,22 @@ } define <4 x float> @ulto4f32(<4 x i64> %a) { -; NODQ-LABEL: ulto4f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: vzeroupper -; NODQ-NEXT: retq +; NOVLDQ-LABEL: ulto4f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: ulto4f32: ; VLDQ: # %bb.0: @@ -317,6 +351,23 @@ ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: ulto4f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: ulto4f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 @@ -968,41 +1019,71 @@ } define <8 x float> @slto8f32(<8 x i64> %a) { -; NODQ-LABEL: slto8f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq +; NOVLDQ-LABEL: slto8f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NOVLDQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: slto8f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: slto8f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, 
%xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: slto8f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 @@ -1012,62 +1093,62 @@ } define <16 x float> @slto16f32(<16 x i64> %a) { -; NODQ-LABEL: slto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq +; NOVLDQ-LABEL: slto16f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; NOVLDQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm3, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm3, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; 
NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOVLDQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; NOVLDQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm3, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm3, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NOVLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: slto16f32: ; VLDQ: # %bb.0: @@ -1076,6 +1157,63 @@ ; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: slto16f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; VLNODQ-NEXT: vmovq %xmm3, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; VLNODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; VLNODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; VLNODQ-NEXT: vmovq %xmm3, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VLNODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: slto16f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 @@ -1203,41 +1341,71 @@ } define <8 x float> @ulto8f32(<8 x i64> %a) { -; NODQ-LABEL: ulto8f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq +; NOVLDQ-LABEL: ulto8f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NOVLDQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NOVLDQ-NEXT: vmovlhps 
{{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: ulto8f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: ulto8f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: ulto8f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 @@ -1247,62 +1415,62 @@ } define <16 x float> @ulto16f32(<16 x i64> %a) { -; NODQ-LABEL: ulto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; 
NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq +; NOVLDQ-LABEL: ulto16f32: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; NOVLDQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm3, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm3, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NOVLDQ-NEXT: vmovq %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm1, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOVLDQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; NOVLDQ-NEXT: vpextrq $1, %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm2, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; NOVLDQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm3, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm3, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; NOVLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NOVLDQ-NEXT: vmovq %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NOVLDQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax +; NOVLDQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NOVLDQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NOVLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: ulto16f32: ; VLDQ: # %bb.0: @@ -1311,6 +1479,63 @@ ; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; VLDQ-NEXT: retq ; 
+; VLNODQ-LABEL: ulto16f32: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; VLNODQ-NEXT: vmovq %xmm3, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; VLNODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm2, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; VLNODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; VLNODQ-NEXT: vmovq %xmm3, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VLNODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLNODQ-NEXT: retq +; ; DQNOVL-LABEL: ulto16f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1791,9 +1791,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -182,11 +182,11 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax @@ -249,10 +249,10 @@ ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -125,32 +125,32 @@ ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltpd %xmm3, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,1,0,2,4,5,6,7] ; SSE-NEXT: cmpltpd %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: cmpltpd %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: cmpltpd %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,0,2,4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -148,7 +148,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -195,7 +195,7 @@ ; SSE2-SSSE3-LABEL: v2i16: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -81,11 +81,11 @@ ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: shrl $2, %ecx ; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastq %rcx, %xmm0 ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpextrb $8, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -134,11 +134,11 @@ ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: shrl $4, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastq %rcx, %xmm0 ; AVX512-NEXT: andl $15, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpextrb $8, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -231,11 +231,11 @@ ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: shrl $2, %ecx ; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastq %rcx, %xmm0 ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpextrb $8, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -287,11 +287,11 @@ ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: shrl $4, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastq %rcx, %xmm0 ; AVX512-NEXT: andl $15, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; 
AVX512-NEXT: vpextrb $8, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -492,11 +492,11 @@ ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: shrl $4, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpbroadcastq %rcx, %xmm0 ; AVX512-NEXT: andl $15, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512-NEXT: vpextrb $0, %xmm0, %eax +; AVX512-NEXT: vpextrb $8, %xmm0, %eax ; AVX512-NEXT: addb %cl, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -48,8 +48,8 @@ ; ; SSE41-64-LABEL: test_buildvector_v4f32: ; SSE41-64: # %bb.0: -; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; SSE41-64-NEXT: retq ; @@ -60,8 +60,8 @@ ; ; AVX-64-LABEL: test_buildvector_v4f32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: retq %ins0 = insertelement <4 x float> undef, float %a0, i32 0 @@ -531,7 +531,7 @@ ; ; SSE41-64-LABEL: PR37502: ; SSE41-64: # %bb.0: -; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE41-64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-64-NEXT: retq ; @@ -540,17 +540,11 @@ ; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX-32-NEXT: retl ; -; AVX1-64-LABEL: PR37502: -; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-64-NEXT: retq -; -; AVX2-64-LABEL: PR37502: -; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX2-64-NEXT: retq +; AVX-64-LABEL: PR37502: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX-64-NEXT: retq %i0 = insertelement <4 x float> undef, float %x, i32 0 %i1 = insertelement <4 x float> %i0, float %y, i32 1 %i2 = insertelement <4 x float> %i1, float %x, i32 2 diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -31,11 +31,11 @@ ; ; AVX-64-LABEL: test_buildvector_v8f32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; 
AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -39,23 +39,27 @@ ; ; AVX-64-LABEL: test_buildvector_v16f32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; AVX-64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; AVX-64-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX-64-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; AVX-64-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],mem[0] +; AVX-64-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; AVX-64-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX-64-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; AVX-64-NEXT: vinsertps {{.*#+}} xmm9 = xmm9[0,1,2],mem[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-64-NEXT: vinsertf64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX-64-NEXT: retq %ins0 = insertelement <16 x float> undef, float %a0, i32 0 %ins1 = insertelement <16 x float> %ins0, float %a1, i32 1 diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -404,13 +404,13 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) { ; SSE-LABEL: extract1_i16_zext_insert0_i64_undef: ; SSE: # %bb.0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract1_i16_zext_insert0_i64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 1 @@ -647,8 +647,7 @@ define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) { ; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlq $48, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -44,12 +44,11 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; @@ -96,8 +95,8 @@ ; ; SSE41-LABEL: test_buildvector_v4f32_register: ; SSE41: # %bb.0: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; SSE41-NEXT: retq %ins0 = insertelement <4 x float> undef, float %f0, i32 0 @@ -121,9 +120,11 @@ ; ; SSE41-LABEL: test_buildvector_v4f32_load: ; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq %f0 = load float, float* %p0, align 4 @@ -148,8 +149,8 @@ ; ; SSE41-LABEL: test_buildvector_v4f32_partial_load: ; SSE41: # %bb.0: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; 
SSE41-NEXT: retq %f3 = load float, float* %p3, align 4 diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -266,7 +266,7 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: orps %xmm5, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: andps %xmm4, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1981,28 +1981,27 @@ ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld $29, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $3, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psrld $30, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $4, %xmm3 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: psrad $3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -414,10 +414,10 @@ ; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $5, %xmm1 +; SSE2-NEXT: psrad $3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrad $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 ; SSE2-NEXT: psrad $4, %xmm0 @@ -476,10 +476,10 @@ ; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $7, %xmm1 +; SSE2-NEXT: psrad $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad 
$5, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrad $7, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $8, %xmm1 ; SSE2-NEXT: psrad $6, %xmm0 @@ -541,10 +541,10 @@ ; SSE2-LABEL: combine_vec_shl_gt_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $5, %xmm1 +; SSE2-NEXT: psrld $3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $3, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrld $5, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: psrld $4, %xmm0 @@ -606,10 +606,10 @@ ; SSE2-LABEL: combine_vec_shl_le_lshr1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $7, %xmm1 +; SSE2-NEXT: psrld $5, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $5, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; SSE2-NEXT: psrld $7, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: psrld $6, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -217,8 +217,7 @@ ; ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -590,7 +590,7 @@ ; X86-NEXT: movdqa %xmm4, (%ecx) ; X86-NEXT: pmuludq %xmm1, %xmm4 ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; X86-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -633,7 +633,7 @@ ; X64-NEXT: movdqa %xmm4, (%rdi) ; X64-NEXT: pmuludq %xmm1, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -590,7 +590,7 @@ ; X86-NEXT: movdqa %xmm4, (%ecx) ; X86-NEXT: pmuludq %xmm1, %xmm4 ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; X86-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -633,7 +633,7 @@ ; X64-NEXT: movdqa %xmm4, (%rdi) ; X64-NEXT: pmuludq %xmm1, %xmm4 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; X64-NEXT: shufps {{.*#+}} 
xmm5 = xmm5[0,0],xmm2[0,0] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -356,19 +356,6 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] -; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] @@ -378,6 +365,19 @@ ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload @@ -387,22 
+387,26 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x14] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x14,0xc0] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] @@ -616,19 +620,6 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] -; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] @@ -639,6 +630,19 @@ ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] @@ -670,13 +674,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, 
%xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 @@ -684,13 +688,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x78] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 @@ -703,10 +707,10 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] @@ -715,30 +719,38 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; 
FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x38] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 ## encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: ## xmm3 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x14,0xc0] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm1[0] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm1, %xmm2, %xmm1 ## encoding: [0xc5,0xe8,0x14,0xc9] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm3, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0xcb] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],xmm3[0] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] @@ -1073,15 +1085,15 @@ 
; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0 ; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] @@ -1092,8 +1104,20 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] @@ -1104,27 +1128,15 @@ ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; 
FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1134,10 +1146,10 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1145,26 +1157,26 @@ ; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] -; 
FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 @@ -1173,13 +1185,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] @@ -1199,8 +1211,20 @@ ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00] @@ -1211,27 +1235,15 @@ ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] -; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; 
FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1241,10 +1253,10 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1252,26 +1264,26 @@ ; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 
encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xd0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 @@ -1283,10 +1295,10 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] @@ -1295,10 +1307,10 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] 
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] @@ -1307,10 +1319,10 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] @@ -1319,49 +1331,65 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x38] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = 
mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 ## encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: ## xmm3 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm4 ## encoding: [0xc5,0xfa,0x10,0x64,0x24,0x44] +; FMACALL32_BDVER2-NEXT: ## xmm4 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x14,0xc0] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm1[0] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm1, %xmm2, %xmm1 ## encoding: [0xc5,0xe8,0x14,0xc9] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x24] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0xca] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] +; 
FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm1, %xmm2, %xmm1 ## encoding: [0xc5,0xe8,0x14,0xc9] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x54] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0xca] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],xmm2[0] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x48] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vunpcklps %xmm2, %xmm3, %xmm2 ## encoding: [0xc5,0xe0,0x14,0xd2] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm4, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0xd4] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],xmm4[0] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] ; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d] diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -909,13 +909,13 @@ ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -380,11 +380,11 @@ ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX512-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; 
AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 -; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -420,8 +420,8 @@ ; BWON-F16C-NEXT: movswl 2(%rdi), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm3 ; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; BWON-F16C-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; BWON-F16C-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -85,9 +85,9 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; X32SSE2-LABEL: elt3_v4i32: ; X32SSE2: # %bb.0: -; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> ; X32SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> +; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32SSE2-NEXT: retl ; @@ -95,7 +95,7 @@ ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movd %edi, %xmm1 ; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> -; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64SSE2-NEXT: retq ; @@ -170,14 +170,14 @@ ; X32SSE2: # %bb.0: ; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X32SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; X32SSE2-NEXT: retl ; ; X64SSE2-LABEL: elt1_v4f32: ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> -; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X64SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; X64SSE2-NEXT: retq ; @@ -241,9 +241,9 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; X32SSE2-LABEL: elt7_v8i32: ; X32SSE2: # %bb.0: -; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> ; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> +; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X32SSE2-NEXT: retl @@ -252,7 +252,7 @@ ; X64SSE2: # %bb.0: ; X64SSE2-NEXT: movd %edi, %xmm0 ; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> -; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64SSE2-NEXT: shufps {{.*#+}} 
xmm1 = xmm1[0,1],xmm0[2,0] ; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -9,7 +9,7 @@ ; X86: # %bb.0: ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X86-NEXT: vpbroadcastd %xmm1, %xmm1 ; X86-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] ; X86-NEXT: retl ; @@ -17,7 +17,7 @@ ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm1 ; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; X64-NEXT: vpbroadcastd %xmm1, %xmm1 ; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] ; X64-NEXT: retq %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -288,7 +288,6 @@ ret <2 x double> %3 } -; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { ; X86-LABEL: signbits_ashr_concat_ashr_extract_sitofp: ; X86: # %bb.0: @@ -298,8 +297,7 @@ ; ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -232,15 +232,17 @@ ; ; SSE41-LABEL: load_float4_float3_trunc_0123: ; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movaps (%rdi), %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_trunc_0123: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* @@ -346,57 +348,40 @@ } define void @PR43227(i32* %explicit_0, <8 x i32>* %explicit_1) { -; SSE2-LABEL: PR43227: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, 672(%rsi) -; SSE2-NEXT: movaps %xmm2, 688(%rsi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR43227: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: movss {{.*#+}} xmm2 = 
xmm0[0],xmm2[1,2,3] -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSSE3-NEXT: movaps %xmm1, 672(%rsi) -; SSSE3-NEXT: movaps %xmm2, 688(%rsi) -; SSSE3-NEXT: retq +; SSE-LABEL: PR43227: +; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, 672(%rsi) +; SSE-NEXT: movdqa %xmm0, 688(%rsi) +; SSE-NEXT: retq ; -; SSE41-LABEL: PR43227: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: movdqa %xmm1, 672(%rsi) -; SSE41-NEXT: movdqa %xmm0, 688(%rsi) -; SSE41-NEXT: retq +; AVX1-LABEL: PR43227: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; AVX-LABEL: PR43227: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: PR43227: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = getelementptr i32, i32* %explicit_0, i64 63 %2 = bitcast i32* %1 to <3 x i32>* %3 = load <3 x i32>, <3 x i32>* %2, align 1 diff --git a/llvm/test/CodeGen/X86/load-slice.ll b/llvm/test/CodeGen/X86/load-slice.ll --- a/llvm/test/CodeGen/X86/load-slice.ll +++ b/llvm/test/CodeGen/X86/load-slice.ll @@ -26,7 +26,7 @@ ; Add high slice: out[out_start].imm, this is base + 4. ; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]] ; Swap Imm and Real. -; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] +; STRESS-NEXT: vunpcklps [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] ; Put the results back into out[out_start]. ; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]]) ; @@ -41,7 +41,7 @@ ; Add high slice: out[out_start].imm, this is base + 4. ; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]] ; Swap Imm and Real. -; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] +; REGULAR-NEXT: vunpcklps [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]] ; Put the results back into out[out_start]. 
; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]]) define void @t1(%class.Complex* nocapture %out, i64 %out_start) { diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1136,7 +1136,7 @@ ; SSE2-NEXT: je LBB4_4 ; SSE2-NEXT: LBB4_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1224,7 +1224,7 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] @@ -1234,7 +1234,7 @@ ; SSE42-LABEL: expandload_v4f32_const: ; SSE42: ## %bb.0: ; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; SSE42-NEXT: retq @@ -1242,7 +1242,7 @@ ; AVX1OR2-LABEL: expandload_v4f32_const: ; AVX1OR2: ## %bb.0: ; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX1OR2-NEXT: retq @@ -1281,13 +1281,13 @@ ; SSE2-NEXT: movups 16(%rdi), %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2] @@ -1297,14 +1297,20 @@ ; ; SSE42-LABEL: expandload_v16f32_const: ; SSE42: ## %bb.0: -; SSE42-NEXT: movups (%rdi), %xmm0 -; SSE42-NEXT: movups 16(%rdi), %xmm1 +; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; SSE42-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE42-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] +; SSE42-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; SSE42-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE42-NEXT: blendps {{.*#+}} xmm2 = 
xmm4[0,1],xmm2[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; SSE42-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; SSE42-NEXT: retq @@ -1312,12 +1318,15 @@ ; AVX1OR2-LABEL: expandload_v16f32_const: ; AVX1OR2: ## %bb.0: ; AVX1OR2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] -; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1OR2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -1326,7 +1335,7 @@ ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1OR2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1536,7 +1545,7 @@ ; SSE2-NEXT: je LBB8_64 ; SSE2-NEXT: LBB8_63: ## %cond.load121 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0] ; SSE2-NEXT: LBB8_64: ## %else122 ; SSE2-NEXT: movaps %xmm0, (%rax) @@ -1556,7 +1565,7 @@ ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm0 @@ -1571,7 +1580,7 @@ ; SSE2-NEXT: je LBB8_8 ; SSE2-NEXT: LBB8_7: ## %cond.load9 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testb $16, %cl @@ -1584,7 +1593,7 @@ ; SSE2-NEXT: je LBB8_12 ; SSE2-NEXT: LBB8_11: ## %cond.load17 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm1 @@ -1599,7 
+1608,7 @@ ; SSE2-NEXT: je LBB8_16 ; SSE2-NEXT: LBB8_15: ## %cond.load25 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 @@ -1612,7 +1621,7 @@ ; SSE2-NEXT: je LBB8_20 ; SSE2-NEXT: LBB8_19: ## %cond.load33 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm2 @@ -1627,7 +1636,7 @@ ; SSE2-NEXT: je LBB8_24 ; SSE2-NEXT: LBB8_23: ## %cond.load41 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 @@ -1640,7 +1649,7 @@ ; SSE2-NEXT: je LBB8_28 ; SSE2-NEXT: LBB8_27: ## %cond.load49 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm3 @@ -1655,7 +1664,7 @@ ; SSE2-NEXT: je LBB8_32 ; SSE2-NEXT: LBB8_31: ## %cond.load57 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 @@ -1668,7 +1677,7 @@ ; SSE2-NEXT: je LBB8_36 ; SSE2-NEXT: LBB8_35: ## %cond.load65 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm4 @@ -1683,7 +1692,7 @@ ; SSE2-NEXT: je LBB8_40 ; SSE2-NEXT: LBB8_39: ## %cond.load73 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 @@ -1696,7 +1705,7 @@ ; SSE2-NEXT: je LBB8_44 ; SSE2-NEXT: LBB8_43: ## %cond.load81 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm5 @@ -1711,7 +1720,7 @@ ; SSE2-NEXT: je LBB8_48 ; SSE2-NEXT: LBB8_47: ## %cond.load89 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $16777216, %ecx ## imm = 0x1000000 @@ -1724,7 +1733,7 @@ ; SSE2-NEXT: je LBB8_52 ; SSE2-NEXT: LBB8_51: ## %cond.load97 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: 
shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm6 @@ -1739,7 +1748,7 @@ ; SSE2-NEXT: je LBB8_56 ; SSE2-NEXT: LBB8_55: ## %cond.load105 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: testl $268435456, %ecx ## imm = 0x10000000 @@ -1752,7 +1761,7 @@ ; SSE2-NEXT: je LBB8_60 ; SSE2-NEXT: LBB8_59: ## %cond.load113 ; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] ; SSE2-NEXT: addq $4, %rsi ; SSE2-NEXT: movaps %xmm8, %xmm7 @@ -2830,7 +2839,7 @@ ; SSE2-NEXT: je LBB10_4 ; SSE2-NEXT: LBB10_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: addq $4, %rdi ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2845,7 +2854,7 @@ ; SSE2-NEXT: je LBB10_8 ; SSE2-NEXT: LBB10_7: ## %cond.load9 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -742,7 +742,7 @@ ; SSE2-NEXT: je LBB7_4 ; SSE2-NEXT: LBB7_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -834,7 +834,7 @@ ; SSE2-NEXT: je LBB8_4 ; SSE2-NEXT: LBB8_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -927,7 +927,7 @@ ; SSE2-NEXT: je LBB9_4 ; SSE2-NEXT: LBB9_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -940,7 +940,7 @@ ; SSE2-NEXT: je LBB9_8 ; SSE2-NEXT: LBB9_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1051,8 +1051,8 @@ ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB10_4 ; SSE2-NEXT: LBB10_3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: testb $4, %al @@ -1065,7 +1065,7 @@ ; SSE2-NEXT: je LBB10_8 ; SSE2-NEXT: LBB10_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB10_10 @@ -1076,7 +1076,7 @@ ; SSE2-NEXT: je LBB10_12 ; SSE2-NEXT: LBB10_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: testb $64, %al @@ -1089,7 +1089,7 @@ ; SSE2-NEXT: je LBB10_16 ; SSE2-NEXT: LBB10_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE2-NEXT: retq ; @@ -1240,7 +1240,7 @@ ; SSE2-NEXT: je LBB11_16 ; SSE2-NEXT: LBB11_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE2-NEXT: LBB11_16: ## %else20 ; SSE2-NEXT: movaps %xmm2, %xmm0 @@ -1253,7 +1253,7 @@ ; SSE2-NEXT: je LBB11_4 ; SSE2-NEXT: LBB11_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: testb $4, %al @@ -1266,7 +1266,7 @@ ; SSE2-NEXT: je LBB11_8 ; SSE2-NEXT: LBB11_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB11_10 @@ -1277,7 +1277,7 @@ ; SSE2-NEXT: je LBB11_12 ; SSE2-NEXT: LBB11_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: testb $64, %al @@ -2069,7 +2069,7 @@ ; SSE2-NEXT: je LBB17_4 ; SSE2-NEXT: LBB17_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2175,7 +2175,7 @@ ; SSE2-NEXT: je LBB18_4 ; SSE2-NEXT: LBB18_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -2188,7 +2188,7 @@ ; SSE2-NEXT: je LBB18_8 ; SSE2-NEXT: LBB18_7: ## 
%cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -2298,7 +2298,7 @@ ; SSE2-NEXT: je LBB19_16 ; SSE2-NEXT: LBB19_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: LBB19_16: ## %else20 ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -2311,7 +2311,7 @@ ; SSE2-NEXT: je LBB19_4 ; SSE2-NEXT: LBB19_3: ## %cond.load1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: testb $4, %al @@ -2324,7 +2324,7 @@ ; SSE2-NEXT: je LBB19_8 ; SSE2-NEXT: LBB19_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB19_10 @@ -2335,7 +2335,7 @@ ; SSE2-NEXT: je LBB19_12 ; SSE2-NEXT: LBB19_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: testb $64, %al @@ -2501,8 +2501,8 @@ ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je LBB20_4 ; SSE2-NEXT: LBB20_3: ## %cond.load1 -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: testb $4, %al @@ -2515,7 +2515,7 @@ ; SSE2-NEXT: je LBB20_8 ; SSE2-NEXT: LBB20_7: ## %cond.load7 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB20_10 @@ -2526,7 +2526,7 @@ ; SSE2-NEXT: je LBB20_12 ; SSE2-NEXT: LBB20_11: ## %cond.load13 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: testb $64, %al @@ -2539,7 +2539,7 @@ ; SSE2-NEXT: je LBB20_16 ; SSE2-NEXT: LBB20_15: ## %cond.load19 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE2-NEXT: retq ; @@ -6131,18 +6131,17 @@ ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE2-NEXT: 
movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSE2-NEXT: retq ; ; SSE42-LABEL: mload_constmask_v4f32: ; SSE42: ## %bb.0: ; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE42-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE42-NEXT: retq ; @@ -6227,13 +6226,11 @@ ; SSE2-LABEL: mload_constmask_v4i32: ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -6312,7 +6309,7 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] @@ -6322,7 +6319,7 @@ ; SSE42-LABEL: mload_constmask_v8f32: ; SSE42: ## %bb.0: ; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; SSE42-NEXT: retq @@ -6454,12 +6451,12 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -3482,9 +3482,8 @@ ; AVX1-LABEL: truncstore_v8i32_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; 
AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1179,16 +1179,20 @@ ; SSE41-LABEL: merge_4f32_f32_2345_volatile: ; SSE41: # %bb.0: ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_2345_volatile: ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX-NEXT: retq ; @@ -1207,9 +1211,11 @@ ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile: ; X32-SSE41: # %bb.0: ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X32-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-SSE41-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 2 diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -512,16 +512,16 @@ ; SSE2-LABEL: v12i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE2-NEXT: movaps %xmm2, %xmm5 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[3,3] ; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2] 
; SSE2-NEXT: movaps %xmm2, 32(%rdi) @@ -918,7 +918,7 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] @@ -969,7 +969,7 @@ ; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] @@ -994,7 +994,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] @@ -1198,44 +1198,40 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu 64(%rdi), %xmm10 +; SSE2-NEXT: movdqu 64(%rdi), %xmm9 ; SSE2-NEXT: movups 80(%rdi), %xmm8 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm11 +; SSE2-NEXT: movdqu 16(%rdi), %xmm10 ; SSE2-NEXT: movups 32(%rdi), %xmm5 -; SSE2-NEXT: movdqu 48(%rdi), %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] -; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,0] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0] -; SSE2-NEXT: movaps %xmm8, %xmm5 +; SSE2-NEXT: movdqu 48(%rdi), %xmm3 +; SSE2-NEXT: movaps %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0] +; SSE2-NEXT: movaps %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,0] -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} 
xmm9 = xmm9[0,3],xmm8[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] -; SSE2-NEXT: movups %xmm9, 16(%rsi) -; SSE2-NEXT: movups %xmm3, (%rsi) -; SSE2-NEXT: movups %xmm2, 16(%rdx) +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm9[2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] +; SSE2-NEXT: movups %xmm2, 16(%rsi) +; SSE2-NEXT: movups %xmm4, (%rsi) +; SSE2-NEXT: movups %xmm3, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) ; SSE2-NEXT: movups %xmm1, 16(%rcx) -; SSE2-NEXT: movups %xmm6, (%rcx) +; SSE2-NEXT: movups %xmm7, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: @@ -1435,29 +1431,29 @@ ; SSE2-NEXT: movups (%rcx), %xmm3 ; SSE2-NEXT: movups 16(%rcx), %xmm6 ; SSE2-NEXT: movaps %xmm3, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] ; SSE2-NEXT: movaps %xmm1, %xmm9 ; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm6[3,3] ; SSE2-NEXT: movaps %xmm6, %xmm4 ; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE2-NEXT: movaps %xmm6, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[1,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] ; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[3,3] ; SSE2-NEXT: movaps %xmm3, %xmm6 ; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] ; SSE2-NEXT: movups %xmm3, 16(%rdi) @@ -1514,21 +1510,21 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovupd (%rsi), %ymm0 ; AVX1-NEXT: vmovups (%rdx), %xmm1 -; AVX1-NEXT: vmovups 16(%rdx), %xmm2 -; AVX1-NEXT: vmovups (%rsi), %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = 
xmm1[1,1],xmm4[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX1-NEXT: vmovups 16(%rcx), %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2] +; AVX1-NEXT: vmovups (%rsi), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-NEXT: vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vmovups 16(%rdx), %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,1],xmm4[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] @@ -1606,21 +1602,21 @@ ; XOP-NEXT: vmovupd (%rsi), %ymm0 ; XOP-NEXT: vmovups (%rcx), %ymm1 ; XOP-NEXT: vmovups (%rdx), %xmm2 -; XOP-NEXT: vmovups 16(%rdx), %xmm3 -; XOP-NEXT: vmovups (%rsi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1] -; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; XOP-NEXT: vmovups 16(%rcx), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] +; XOP-NEXT: vmovups (%rsi), %xmm3 +; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm2[1] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2] +; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOP-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1] +; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; XOP-NEXT: vmovups 16(%rcx), %xmm3 +; XOP-NEXT: vmovups 16(%rdx), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm3[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,2] ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll 
b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -31,25 +31,15 @@ ; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq ; -; AVX1-LABEL: insert_v7i8_v2i16_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX1-NEXT: vpextrw $1, %xmm0, 4(%rdi) -; AVX1-NEXT: vmovd %xmm2, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v7i8_v2i16_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; AVX2-NEXT: vpextrw $1, %xmm0, 4(%rdi) -; AVX2-NEXT: vmovd %xmm2, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: insert_v7i8_v2i16_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX-NEXT: vpextrw $1, %xmm0, 4(%rdi) +; AVX-NEXT: vmovd %xmm2, (%rdi) +; AVX-NEXT: retq ; ; AVX512-LABEL: insert_v7i8_v2i16_2: ; AVX512: # %bb.0: @@ -60,16 +50,6 @@ ; AVX512-NEXT: vpextrw $1, %xmm0, 4(%rdi) ; AVX512-NEXT: vmovd %xmm2, (%rdi) ; AVX512-NEXT: retq -; -; XOP-LABEL: insert_v7i8_v2i16_2: -; XOP: # %bb.0: -; XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; XOP-NEXT: vpextrb $6, %xmm1, 6(%rdi) -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,1,2,3],xmm1[6,7,u,u,u,u,u,u,u,u] -; XOP-NEXT: vpextrw $1, %xmm0, 4(%rdi) -; XOP-NEXT: vmovd %xmm1, (%rdi) -; XOP-NEXT: retq %1 = load <2 x i16>, <2 x i16> *%a1 %2 = bitcast <2 x i16> %1 to <4 x i8> %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1236,70 +1236,67 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm14, %xmm10 +; SSE2-NEXT: movdqa %xmm14, %xmm8 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm15 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: punpckhdq 
{{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE2-NEXT: movdqa %xmm15, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm12, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm7 -; SSE2-NEXT: paddq %xmm4, %xmm7 -; SSE2-NEXT: psllq $32, %xmm7 -; SSE2-NEXT: pmuludq %xmm12, %xmm0 -; SSE2-NEXT: paddq %xmm7, %xmm0 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm15 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm11, %xmm6 -; SSE2-NEXT: paddq %xmm15, %xmm6 -; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE2-NEXT: pcmpgtd %xmm15, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm15, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm6, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm15, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm11, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 +; SSE2-NEXT: psllq $32, %xmm5 ; SSE2-NEXT: pmuludq %xmm11, %xmm1 -; SSE2-NEXT: paddq %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm13, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm14, %xmm5 ; SSE2-NEXT: paddq %xmm4, %xmm5 ; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm10, %xmm2 +; SSE2-NEXT: pmuludq %xmm14, %xmm2 ; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE2-NEXT: pmuludq %xmm3, %xmm13 -; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm9, %xmm14 -; SSE2-NEXT: paddq 
%xmm13, %xmm14 -; SSE2-NEXT: psllq $32, %xmm14 -; SSE2-NEXT: pmuludq %xmm9, %xmm3 -; SSE2-NEXT: paddq %xmm14, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm5 +; SSE2-NEXT: paddq %xmm4, %xmm5 +; SSE2-NEXT: psllq $32, %xmm5 +; SSE2-NEXT: pmuludq %xmm8, %xmm3 +; SSE2-NEXT: paddq %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: @@ -1350,8 +1347,8 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) { ; SSE2-LABEL: pmuldq_square: ; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -34,7 +34,7 @@ ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3] ; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3] +; CHECK-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3] ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3] @@ -42,7 +42,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8 -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] ; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 ; CHECK-NEXT: vmovaps %xmm13, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll --- a/llvm/test/CodeGen/X86/pr30430.ll +++ b/llvm/test/CodeGen/X86/pr30430.ll @@ -61,16 +61,16 @@ ; CHECK-NEXT: vmovss %xmm23, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; CHECK-NEXT: # implicit-def: $ymm2 @@ -78,16 +78,16 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; CHECK-NEXT: # implicit-def: $ymm3 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -561,7 +561,7 @@ ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -943,7 +943,7 @@ ; AVX1-NEXT: vpandn %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1071,7 +1071,7 @@ ; AVX1-NEXT: vpandn %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -899,7 +899,7 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm1 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi @@ -935,7 +935,7 @@ ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm1 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; 
X64-SSE-NEXT: retq @@ -1472,7 +1472,7 @@ ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1497,7 +1497,7 @@ ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: psraw $8, %xmm0 ; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $16, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq @@ -2155,14 +2155,14 @@ ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-SSE-NEXT: movd %xmm0, %eax -; X86-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm4[0,0] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm7 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm5 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X86-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm1[0,0] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm6 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] @@ -2399,7 +2399,7 @@ ; X64-SSE-NEXT: movd %xmm0, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) -; X64-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[0,0] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm7 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] @@ -2408,7 +2408,7 @@ ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm4 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; X64-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm8[0,0] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm6 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] diff --git a/llvm/test/CodeGen/X86/shuffle-of-insert.ll b/llvm/test/CodeGen/X86/shuffle-of-insert.ll --- a/llvm/test/CodeGen/X86/shuffle-of-insert.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-insert.ll @@ -30,7 +30,7 @@ ; SSE2-LABEL: ins_elt_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; @@ -80,7 +80,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; @@ -153,7 +153,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; @@ -176,7 +176,7 @@ ; SSE2-LABEL: ins_elt_3_to_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -13,19 +13,7 @@ ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movq %xmm0, (%rsi) ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -63,8 +63,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) @@ -75,8 +75,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] ; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; 
AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) @@ -87,8 +87,8 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -215,7 +215,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -764,7 +764,7 @@ ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -971,7 +971,7 @@ ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -155,7 +155,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] @@ -168,7 +168,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, 
%ymm1 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] @@ -190,7 +190,7 @@ ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] @@ -704,9 +704,13 @@ ; ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [5569058560453190945,5569058560453190945,5569058560453190945,5569058560453190945,5569058560453190945,5569058560453190945,5569058560453190945,5569058560453190945] +; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VBMI-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281] +; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm0, %zmm2 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; @@ -789,9 +793,13 @@ ; ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [5641116154491118881,5641116154491118881,5641116154491118881,5641116154491118881,5641116154491118881,5641116154491118881,5641116154491118881,5641116154491118881] +; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VBMI-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281,6726501325053953281] +; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm0, %zmm2 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; @@ -812,10 +820,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: retq @@ -826,9 +833,8 @@ ; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3] -; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -836,10 +842,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -857,10 +862,9 @@ ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -1075,52 +1075,52 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,2147483649,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, 
%xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movl $1, %eax +; CHECK-SSE2-NEXT: movd %eax, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,16,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, 
%xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -1406,54 +1406,54 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movl $1, %eax +; CHECK-SSE2-NEXT: movd %eax, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: 
movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,14,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -2709,52 +2709,52 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movl $1, %eax +; 
CHECK-SSE2-NEXT: movd %eax, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 ; CHECK-SSE2-NEXT: psrad $3, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,16,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -2980,54 +2980,54 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: punpckldq 
{{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movl $1, %eax +; CHECK-SSE2-NEXT: movd %eax, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,16,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2003,12 +2003,12 @@ ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: 
vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] ; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10] -; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-AVX1-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1] +; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08] ; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20] -; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X86-AVX1-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1] +; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] ; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] @@ -2021,8 +2021,8 @@ ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] ; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10] -; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08] ; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20] @@ -2046,18 +2046,18 @@ ; ; X64-AVX1-LABEL: test_mm_set_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] -; X64-AVX1-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; X64-AVX1-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] -; X64-AVX1-NEXT: # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X64-AVX1-NEXT: vunpcklps %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe0,0x14,0xd2] +; X64-AVX1-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-AVX1-NEXT: vmovlhps %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x16,0xc9] +; X64-AVX1-NEXT: # xmm1 = xmm2[0],xmm1[0] ; X64-AVX1-NEXT: vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30] ; X64-AVX1-NEXT: # xmm0 = xmm1[0,1,2],xmm0[0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_set_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] -; X64-AVX512-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X64-AVX512-NEXT: vunpcklps %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x14,0xd2] +; X64-AVX512-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X64-AVX512-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] ; X64-AVX512-NEXT: # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X64-AVX512-NEXT: vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30] @@ -2340,10 +2340,10 @@ ; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero ; 
X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] ; X86-AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] -; X86-AVX1-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] -; X86-AVX1-NEXT: # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X86-AVX1-NEXT: vunpcklps %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe0,0x14,0xd2] +; X86-AVX1-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-AVX1-NEXT: vmovlhps %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x16,0xc9] +; X86-AVX1-NEXT: # xmm1 = xmm2[0],xmm1[0] ; X86-AVX1-NEXT: vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30] ; X86-AVX1-NEXT: # xmm0 = xmm1[0,1,2],xmm0[0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2358,8 +2358,8 @@ ; X86-AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] ; X86-AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] -; X86-AVX512-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X86-AVX512-NEXT: vunpcklps %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x14,0xd2] +; X86-AVX512-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-AVX512-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] ; X86-AVX512-NEXT: # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X86-AVX512-NEXT: vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30] @@ -2378,18 +2378,18 @@ ; ; X64-AVX1-LABEL: test_mm_setr_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10] -; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X64-AVX1-NEXT: vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20] -; X64-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-AVX1-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1] +; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX1-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc2] +; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm2[0] ; X64-AVX1-NEXT: vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm3[0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_setr_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10] -; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X64-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-AVX512-NEXT: vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20] ; X64-AVX512-NEXT: # xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; X64-AVX512-NEXT: vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30] diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -321,12 +321,19 @@ ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-SSE-NEXT: retq 
; -; X64-AVX-LABEL: test9: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test9: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test9: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX512-NEXT: retq %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] @@ -352,12 +359,19 @@ ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test10: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test10: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test10: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX512-NEXT: retq %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -468,8 +468,8 @@ ; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -539,8 +539,8 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX512-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX512-NEXT: vpermilps {{.*#+}} 
xmm4 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm4, %xmm3, %xmm3 @@ -554,8 +554,8 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX512-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -656,8 +656,8 @@ ; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32: ; X86-AVX512: ## %bb.0: ## %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $36, (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x08,0x24] -; X86-AVX512-NEXT: ## xmm1 = mem[0,1,2,0] +; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] ; X86-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -680,8 +680,8 @@ ; ; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32: ; X64-AVX512: ## %bb.0: ## %entry -; X64-AVX512-NEXT: vpermilps $36, (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x24] -; X64-AVX512-NEXT: ## xmm1 = mem[0,1,2,0] +; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] ; X64-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] @@ -1226,8 +1226,7 @@ ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] ; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX512-NEXT: vpermilps $36, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x24] -; AVX512-NEXT: ## xmm1 = xmm1[0,1,2,0] +; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] @@ -2060,8 +2059,8 @@ ; ; AVX512-LABEL: insertps_8: ; AVX512: ## %bb.0: -; AVX512-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero +; AVX512-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xc1] +; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] ; AVX512-NEXT: ## xmm0 = xmm0[0],zero ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll 
b/llvm/test/CodeGen/X86/test-shrink-bug.ll --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -70,8 +70,8 @@ ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: ; CHECK-X64-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax +; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] +; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -79,12 +79,8 @@ ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -181,12 +177,8 @@ ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -238,36 +238,37 @@ ; X64-LABEL: vec: ; X64: # %bb.0: ; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movq %xmm3, %rcx +; X64-NEXT: movdqa %xmm0, %xmm4 ; X64-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: movdqa %xmm0, %xmm5 -; X64-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; X64-NEXT: psllq $31, %xmm5 -; X64-NEXT: movq %xmm5, %rax +; X64-NEXT: psllq $31, %xmm4 +; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; X64-NEXT: movq %xmm4, %rcx -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] ; X64-NEXT: movq %xmm4, %rax +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psrldq {{.*#+}} xmm4 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm4 ; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; X64-NEXT: movq %xmm2, %rcx ; X64-NEXT: psllq $31, %xmm0 ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: psrlq $32, %xmm1 +; X64-NEXT: movq %xmm1, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -202,26 +202,24 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -303,26 +301,24 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 @@ -646,13 +642,12 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -730,25 +725,23 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -866,10 +859,9 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 ; CHECK-SSE41-NEXT: psrld $2, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] @@ -891,11 +883,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -999,27 +991,24 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 -; CHECK-SSE2-NEXT: movd %eax, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: movd %eax, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1118,18 +1107,16 @@ ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: psrld $5, %xmm1 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1228,13 +1215,12 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: 
pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1312,25 +1298,23 @@ ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -1448,10 +1432,9 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 ; CHECK-SSE41-NEXT: psrld $2, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] @@ -1473,11 +1456,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1541,7 +1524,7 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -1559,10 +1542,9 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 ; CHECK-SSE41-NEXT: psrld $31, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] @@ -1582,12 +1564,12 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1638,8 +1620,7 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1652,7 +1633,7 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -1673,14 +1654,13 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 ; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -1698,12 +1678,12 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1759,7 +1739,7 @@ ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 ; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; CHECK-SSE2-NEXT: psrld $5, %xmm2 ; CHECK-SSE2-NEXT: psrld $31, %xmm1 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] @@ -1784,15 +1764,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm3 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $5, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; 
CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 ; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -1808,13 +1788,13 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1906,22 +1886,19 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 ; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,1,14] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2033,9 +2010,7 @@ ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -2139,18 +2114,16 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: psrld $2, 
%xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 @@ -2238,23 +2211,20 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -2351,19 +2321,17 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -2381,7 +2349,7 @@ ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: psrld $2, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] @@ -2401,8 +2369,8 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] @@ -2461,7 +2429,7 @@ ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 ; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[2,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -2485,15 +2453,15 @@ ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2507,11 +2475,11 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -2562,14 +2530,13 @@ ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 ; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 ; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2596,15 +2563,15 @@ ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7] ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 ; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2620,11 +2587,11 @@ ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3,4,5,6,7] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -280,24 +280,20 @@ ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,2] -; CHECK-SSE2-NEXT: psrld $1, %xmm2 -; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2,3] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,3] +; CHECK-SSE2-NEXT: movapd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 @@ -306,15 +302,13 @@ ; CHECK-SSE41-LABEL: t32_tautological: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrld $1, %xmm3 +; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = 
xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 @@ -324,14 +318,12 @@ ; CHECK-AVX1-LABEL: t32_tautological: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 @@ -341,12 +333,11 @@ ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -100,14 +100,14 @@ ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,1,2,3] ; CHECK-NEXT: callq sinf @@ -132,14 +132,14 @@ ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,1,2,3] ; CHECK-NEXT: callq sinf @@ -183,14 +183,14 @@ ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,1,2,3] ; CHECK-NEXT: callq sinf diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -866,40 +866,42 @@ } define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { -; AVX-32-LABEL: sitofp_v4i64_v4f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $48, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: vzeroupper -; AVX-32-NEXT: retl +; AVX1-32-LABEL: sitofp_v4i64_v4f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: 
.cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $48, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: vzeroupper +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: sitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: @@ -907,58 +909,168 @@ ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm0, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-64-NEXT: vmovq %xmm0, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: sitofp_v4i64_v4f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $48, %esp +; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; 
AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: vzeroupper +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: sitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: sitofp_v4i64_v4f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $48, %esp +; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: sitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: sitofp_v4i64_v4f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; 
AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $48, %esp +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: vzeroupper +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: sitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 @@ -989,55 +1101,57 @@ } define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { -; AVX-32-LABEL: uitofp_v4i64_v4f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $48, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractps $1, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $1, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: 
fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: vzeroupper -; AVX-32-NEXT: retl +; AVX1-32-LABEL: uitofp_v4i64_v4f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $48, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX1-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractps $1, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $1, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: vzeroupper +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: @@ -1052,11 +1166,11 @@ ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-64-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] @@ -1066,6 +1180,58 @@ ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: uitofp_v4i64_v4f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; 
AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $48, %esp +; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX2-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextractps $1, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vextractps $3, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vextractps $1, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vextractps $3, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: vzeroupper +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: uitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1081,11 +1247,11 @@ ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] @@ -1094,30 +1260,133 @@ ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: uitofp_v4i64_v4f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $48, %esp +; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX512F-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextractps $1, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; 
AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vextractps $3, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vextractps $1, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vextractps $3, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: uitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: uitofp_v4i64_v4f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $48, %esp +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] +; AVX512VL-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $1, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vextractps $3, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vextractps 
$1, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vextractps $3, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: vzeroupper +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: uitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -433,12 +433,16 @@ ; NODQ-32-NEXT: fstps (%esp) ; NODQ-32-NEXT: wait ; NODQ-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-32-NEXT: movl %ebp, %esp @@ -453,11 +457,11 @@ ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-64-NEXT: vmovq %xmm2, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 
; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -465,11 +469,11 @@ ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-64-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-64-NEXT: vmovq %xmm0, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] @@ -560,12 +564,16 @@ ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait ; NODQ-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; NODQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; NODQ-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NODQ-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NODQ-32-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-32-NEXT: movl %ebp, %esp @@ -580,11 +588,11 @@ ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-64-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-64-NEXT: vmovq %xmm2, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -592,11 +600,11 @@ ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-64-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-64-NEXT: vmovq %xmm0, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] diff --git a/llvm/test/CodeGen/X86/vec_insert-2.ll b/llvm/test/CodeGen/X86/vec_insert-2.ll --- a/llvm/test/CodeGen/X86/vec_insert-2.ll +++ b/llvm/test/CodeGen/X86/vec_insert-2.ll @@ -6,13 +6,13 @@ ; X32-LABEL: t1: ; X32: 
# %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq @@ -24,14 +24,14 @@ ; X32-LABEL: t2: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t2: ; X64: # %bb.0: ; X64-NEXT: movd %edi, %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64-NEXT: retq %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3 diff --git a/llvm/test/CodeGen/X86/vec_insert-3.ll b/llvm/test/CodeGen/X86/vec_insert-3.ll --- a/llvm/test/CodeGen/X86/vec_insert-3.ll +++ b/llvm/test/CodeGen/X86/vec_insert-3.ll @@ -6,11 +6,9 @@ ; X32-LABEL: t1: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movaps %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t1: diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -11,7 +11,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shll $12, %ecx ; X32-NEXT: movd %ecx, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X32-NEXT: psllq $32, %xmm0 ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: retl ; @@ -19,7 +19,7 @@ ; X64: # %bb.0: ; X64-NEXT: shll $12, %edi ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-NEXT: psllq $32, %xmm0 ; X64-NEXT: movq %xmm0, (%rsi) ; X64-NEXT: retq %tmp12 = shl i32 %a, 12 @@ -34,17 +34,15 @@ ; X32-LABEL: t2: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movaps (%eax), %xmm1 +; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; ; X64-LABEL: t2: ; X64: # %bb.0: -; X64-NEXT: movaps (%rdi), %xmm1 +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* %P diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -189,7 +189,7 @@ define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { ; SSE2-LABEL: sitofp_2i16_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -213,7 +213,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { ; SSE2-LABEL: sitofp_8i16_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -246,7 +246,7 @@ ; SSE2-LABEL: sitofp_2i8_to_2f64: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -271,7 +271,7 @@ ; SSE2-LABEL: sitofp_16i8_to_2f64: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -1593,11 +1593,11 @@ ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 @@ -1610,11 +1610,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -1627,11 +1627,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -1644,11 +1644,11 @@ ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: 
vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -1661,7 +1661,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 @@ -2159,11 +2159,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] @@ -2185,11 +2185,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] @@ -2541,7 +2541,7 @@ ; SSE41-NEXT: movq %xmm5, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psrlq $1, %xmm5 @@ -2552,7 +2552,7 @@ ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 @@ -2577,11 +2577,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] @@ -2606,11 +2606,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] @@ -2625,11 +2625,11 @@ ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -2642,7 +2642,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 @@ -3080,7 +3080,7 @@ ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -3926,11 +3926,11 @@ ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 @@ -3945,10 +3945,10 @@ ; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; VEX-NEXT: vmovq %xmm1, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; VEX-NEXT: vpextrq $1, %xmm1, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -3962,10 +3962,10 @@ ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; 
AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -3979,7 +3979,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] @@ -4124,11 +4124,11 @@ ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm4, %xmm4 ; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 @@ -4139,11 +4139,11 @@ ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE41-NEXT: pextrq $1, %xmm3, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 @@ -4160,10 +4160,10 @@ ; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 ; VEX-NEXT: vmovq %xmm2, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; VEX-NEXT: vmovq %xmm3, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; VEX-NEXT: vpextrq $1, %xmm3, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] @@ -4171,10 +4171,10 @@ ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; VEX-NEXT: vmovq %xmm1, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; VEX-NEXT: vpextrq $1, %xmm1, %rax ; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -4191,10 +4191,10 @@ ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 ; AVX512F-NEXT: vmovq %xmm2, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512F-NEXT: vmovq %xmm3, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; AVX512F-NEXT: vpextrq $1, %xmm3, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; AVX512F-NEXT: 
vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] @@ -4202,10 +4202,10 @@ ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -4222,7 +4222,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 ; AVX512VL-NEXT: vmovq %xmm2, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512VL-NEXT: vmovq %xmm3, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] @@ -4233,7 +4233,7 @@ ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] @@ -4457,7 +4457,7 @@ ; SSE41-NEXT: movq %xmm5, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE41-NEXT: pand %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psrlq $1, %xmm5 @@ -4468,7 +4468,7 @@ ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 @@ -4495,11 +4495,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] @@ -4525,11 +4525,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 ; AVX2-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] @@ -4546,10 +4546,10 @@ ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -4563,7 +4563,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] @@ -4877,7 +4877,7 @@ ; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm5, %xmm1 @@ -4889,7 +4889,7 @@ ; SSE41-NEXT: movq %xmm5, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE41-NEXT: pextrq $1, %xmm5, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 @@ -4912,7 +4912,7 @@ ; SSE41-NEXT: movq %xmm4, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE41-NEXT: pand %xmm2, %xmm7 ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrlq $1, %xmm4 @@ -4923,7 +4923,7 @@ ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE41-NEXT: pextrq $1, %xmm2, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 @@ -4954,11 +4954,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] @@ -4975,11 +4975,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm3 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; 
AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] @@ -5006,11 +5006,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[0] @@ -5027,11 +5027,11 @@ ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] @@ -5050,10 +5050,10 @@ ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 ; AVX512F-NEXT: vmovq %xmm2, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512F-NEXT: vmovq %xmm3, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; AVX512F-NEXT: vpextrq $1, %xmm3, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] @@ -5061,10 +5061,10 @@ ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -5081,7 +5081,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 ; AVX512VL-NEXT: vmovq %xmm2, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512VL-NEXT: vmovq %xmm3, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] @@ -5092,7 +5092,7 @@ ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -480,8 +480,8 @@ ; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: pmuludq %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm10[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] ; SSE2-NEXT: pmuludq %xmm12, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -552,8 +552,8 @@ ; SSSE3-NEXT: paddd %xmm5, %xmm0 ; SSSE3-NEXT: pmuludq %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm10[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] ; SSSE3-NEXT: pmuludq %xmm12, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -423,8 +423,8 @@ ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm4[0,0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE2-NEXT: pmuludq %xmm3, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -473,8 +473,8 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: pmuludq %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm4[0,0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSSE3-NEXT: pmuludq %xmm3, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6642,11 +6642,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -6659,11 +6659,11 @@ ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -7497,11 +7497,11 @@ ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] @@ -7517,11 +7517,11 @@ ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -22,619 +22,1162 @@ } define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { -; ALL-LABEL: cvt_4i16_to_4f32: -; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: retq - %1 = bitcast <4 x i16> %a0 to <4 x half> - %2 = fpext <4 x half> %1 to <4 x float> - ret <4 x float> %2 -} - -define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { -; ALL-LABEL: cvt_8i16_to_4f32: -; ALL: # %bb.0: -; ALL-NEXT: vmovq 
%xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: retq - %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> - %2 = bitcast <4 x i16> %1 to <4 x half> - %3 = fpext <4 x half> %2 to <4 x float> - ret <4 x float> %3 -} - -define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { -; ALL-LABEL: cvt_8i16_to_8f32: -; ALL: # %bb.0: -; ALL-NEXT: vpextrq $1, %xmm0, %rdx -; ALL-NEXT: movq %rdx, %r8 -; ALL-NEXT: movq %rdx, %r10 -; ALL-NEXT: movswl %dx, %r9d -; ALL-NEXT: # kill: def $edx killed $edx killed $rdx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: shrq $32, %r8 -; ALL-NEXT: shrq $48, %r10 -; ALL-NEXT: vmovq %xmm0, %rdi -; ALL-NEXT: movq %rdi, %rax -; ALL-NEXT: movq %rdi, %rsi -; ALL-NEXT: movswl %di, %ecx -; ALL-NEXT: # kill: def $edi killed $edi killed $rdi -; ALL-NEXT: shrl $16, %edi -; ALL-NEXT: shrq $32, %rax -; ALL-NEXT: shrq $48, %rsi -; ALL-NEXT: movswl %si, %esi -; ALL-NEXT: vmovd %esi, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl %di, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %ecx, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: movswl %r10w, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: movswl %r8w, %eax -; ALL-NEXT: vmovd %eax, %xmm5 -; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 -; ALL-NEXT: movswl %dx, %eax -; ALL-NEXT: vmovd %eax, %xmm6 -; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 -; ALL-NEXT: vmovd %r9d, %xmm7 -; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 -; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; ALL-NEXT: retq - %1 = bitcast <8 x i16> %a0 to <8 x half> - %2 = fpext <8 x half> %1 to <8 x float> - ret <8 x float> %2 -} - -define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_16i16_to_16f32: +; AVX1-LABEL: cvt_4i16_to_4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm8 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm9 -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm10 -; AVX1-NEXT: vpextrq $1, %xmm4, %rax -; AVX1-NEXT: 
vmovd %ecx, %xmm11 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm12 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm13 -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm14 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovd %ecx, %xmm15 ; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movswl %ax, %esi ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax ; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm5 -; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: shrq $48, %rdx +; AVX1-NEXT: movswl %dx, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm6 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm7 +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX1-NEXT: vmovd %eax, %xmm2 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vmovd %esi, %xmm3 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: retq ; -; AVX2-LABEL: cvt_16i16_to_16f32: +; AVX2-LABEL: cvt_4i16_to_4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, 
%rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm8 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm9 -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm10 -; AVX2-NEXT: vpextrq $1, %xmm4, %rax -; AVX2-NEXT: vmovd %ecx, %xmm11 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm12 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm13 -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm14 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovd %ecx, %xmm15 ; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movswl %ax, %esi ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm5 -; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: movswl %dx, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm6 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm7 +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vmovd %esi, %xmm3 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; 
AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_16i16_to_16f32: +; AVX512F-LABEL: cvt_4i16_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm8 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm9 -; AVX512F-NEXT: movswl %ax, %ecx +; AVX512F-NEXT: movq %rax, %rdx +; AVX512F-NEXT: movswl %ax, %esi ; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm11 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm12 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm13 -; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: shrq $48, %rdx +; AVX512F-NEXT: movswl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm14 -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm15 -; AVX512F-NEXT: vmovq %xmm10, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm2 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm3 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm10, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm10 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm5 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm6 -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm7 -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX512F-NEXT: vmovd %eax, %xmm2 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %esi, %xmm3 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: cvt_16i16_to_16f32: +; AVX512VL-LABEL: cvt_4i16_to_4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm8 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm9 -; AVX512VL-NEXT: movswl %ax, %ecx +; AVX512VL-NEXT: movq %rax, %rdx +; AVX512VL-NEXT: movswl %ax, %esi ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm11 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm12 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm13 -; AVX512VL-NEXT: movq %rax, %rcx ; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: shrq $48, %rdx +; AVX512VL-NEXT: movswl %dx, %edx +; AVX512VL-NEXT: vmovd %edx, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm14 -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm15 -; AVX512VL-NEXT: vmovq %xmm10, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm16 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm17 +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vmovd %esi, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: retq + %1 = bitcast <4 x i16> %a0 to <4 x half> + %2 = fpext <4 x half> %1 to <4 x float> + ret <4 x float> %2 +} + +define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { +; AVX1-LABEL: cvt_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdx 
+; AVX1-NEXT: movswl %ax, %esi +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: shrq $48, %rdx +; AVX1-NEXT: movswl %dx, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vmovd %esi, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movswl %ax, %esi +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: movswl %dx, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cvt_8i16_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: movq %rax, %rdx +; AVX512F-NEXT: movswl %ax, %esi +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: shrq $48, %rdx +; AVX512F-NEXT: movswl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %esi, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: cvt_8i16_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm18 -; AVX512VL-NEXT: movswl %ax, %ecx +; AVX512VL-NEXT: movq %rax, %rdx +; AVX512VL-NEXT: movswl %ax, %esi ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm19 -; AVX512VL-NEXT: vpextrq $1, %xmm10, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm10 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm20 -; AVX512VL-NEXT: movq %rax, %rcx ; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: shrq $48, %rdx +; AVX512VL-NEXT: movswl %dx, %edx +; AVX512VL-NEXT: vmovd %edx, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd 
%ecx, %xmm21 -; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $16, %ecx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm22 +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512VL-NEXT: cwtl ; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16 -; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vmovd %esi, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: retq - %1 = bitcast <16 x i16> %a0 to <16 x half> - %2 = fpext <16 x half> %1 to <16 x float> - ret <16 x float> %2 -} - -; -; Half to Float (Load) -; - -define float @load_cvt_i16_to_f32(i16* %a0) nounwind { -; ALL-LABEL: load_cvt_i16_to_f32: -; ALL: # %bb.0: -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: retq - %1 = load i16, i16* %a0 - %2 = bitcast i16 %1 to half - %3 = fpext half %2 to float - ret float %3 -} - -define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { -; ALL-LABEL: load_cvt_4i16_to_4f32: -; ALL: # %bb.0: -; ALL-NEXT: movswl 6(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl 4(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: movswl 2(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: retq - %1 = load <4 x i16>, <4 x i16>* %a0 + %1 = shufflevector <8 x i16> %a0, <8 x i16> 
undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> %3 = fpext <4 x half> %2 to <4 x float> ret <4 x float> %3 } -define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { -; ALL-LABEL: load_cvt_8i16_to_4f32: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: retq - %1 = load <8 x i16>, <8 x i16>* %a0 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> - %3 = bitcast <4 x i16> %2 to <4 x half> - %4 = fpext <4 x half> %3 to <4 x float> +define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { +; AVX1-LABEL: cvt_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: movswl %dx, %r9d +; AVX1-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: shrq $32, %r8 +; AVX1-NEXT: shrq $48, %r10 +; AVX1-NEXT: vmovq %xmm0, %rdi +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: movq %rdi, %rsi +; AVX1-NEXT: movswl %di, %ecx +; AVX1-NEXT: # kill: def $edi killed $edi killed $rdi +; AVX1-NEXT: shrl $16, %edi +; AVX1-NEXT: shrq $32, %rax +; AVX1-NEXT: shrq $48, %rsi +; AVX1-NEXT: movswl %si, %esi +; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: movswl %di, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vmovd %ecx, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: movswl %r10w, %eax +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX1-NEXT: movswl %r8w, %eax +; AVX1-NEXT: vmovd %eax, %xmm5 +; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX1-NEXT: movswl %dx, %eax +; AVX1-NEXT: vmovd %eax, %xmm6 +; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX1-NEXT: vmovd %r9d, %xmm7 +; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movswl %dx, %r9d +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: shrq $32, %r8 +; AVX2-NEXT: shrq $48, %r10 +; AVX2-NEXT: vmovq %xmm0, %rdi +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: movswl %di, %ecx +; AVX2-NEXT: # kill: def $edi killed $edi killed 
$rdi +; AVX2-NEXT: shrl $16, %edi +; AVX2-NEXT: shrq $32, %rax +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: movswl %si, %esi +; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: movswl %di, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vmovd %ecx, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: movswl %r10w, %eax +; AVX2-NEXT: vmovd %eax, %xmm4 +; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX2-NEXT: movswl %r8w, %eax +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX2-NEXT: movswl %dx, %eax +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: vmovd %r9d, %xmm7 +; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cvt_8i16_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: movq %rdx, %r8 +; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movswl %dx, %r9d +; AVX512F-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX512F-NEXT: shrl $16, %edx +; AVX512F-NEXT: shrq $32, %r8 +; AVX512F-NEXT: shrq $48, %r10 +; AVX512F-NEXT: vmovq %xmm0, %rdi +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq %rdi, %rsi +; AVX512F-NEXT: movswl %di, %ecx +; AVX512F-NEXT: # kill: def $edi killed $edi killed $rdi +; AVX512F-NEXT: shrl $16, %edi +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: shrq $48, %rsi +; AVX512F-NEXT: movswl %si, %esi +; AVX512F-NEXT: vmovd %esi, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: movswl %di, %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: movswl %r10w, %eax +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512F-NEXT: movswl %r8w, %eax +; AVX512F-NEXT: vmovd %eax, %xmm5 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512F-NEXT: movswl %dx, %eax +; AVX512F-NEXT: vmovd %eax, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %r9d, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: cvt_8i16_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: movq %rdx, %r8 +; AVX512VL-NEXT: movq %rdx, %r10 +; AVX512VL-NEXT: movswl %dx, %r9d +; AVX512VL-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX512VL-NEXT: shrl $16, %edx +; AVX512VL-NEXT: shrq $32, %r8 +; AVX512VL-NEXT: shrq $48, %r10 +; AVX512VL-NEXT: vmovq 
%xmm0, %rdi +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq %rdi, %rsi +; AVX512VL-NEXT: movswl %di, %ecx +; AVX512VL-NEXT: # kill: def $edi killed $edi killed $rdi +; AVX512VL-NEXT: shrl $16, %edi +; AVX512VL-NEXT: shrq $32, %rax +; AVX512VL-NEXT: shrq $48, %rsi +; AVX512VL-NEXT: movswl %si, %esi +; AVX512VL-NEXT: vmovd %esi, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: movswl %di, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vmovd %ecx, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: movswl %r10w, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512VL-NEXT: movswl %r8w, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm5 +; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512VL-NEXT: movswl %dx, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm6 +; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512VL-NEXT: vmovd %r9d, %xmm7 +; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq + %1 = bitcast <8 x i16> %a0 to <8 x half> + %2 = fpext <8 x half> %1 to <8 x float> + ret <8 x float> %2 +} + +define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { +; AVX1-LABEL: cvt_16i16_to_16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm8 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm9 +; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm10 +; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vmovd %ecx, %xmm11 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm12 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm13 +; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm14 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vmovd %ecx, %xmm15 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm3 +; AVX1-NEXT: movswl %ax, %ecx +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm5 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: movswl %cx, %ecx +; 
AVX1-NEXT: vmovd %ecx, %xmm6 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm7 +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10 +; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11 +; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_16i16_to_16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm8 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm9 +; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm10 +; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: vmovd %ecx, %xmm11 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm12 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm13 +; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm14 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vmovd %ecx, %xmm15 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm3 +; AVX2-NEXT: movswl %ax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm4 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm5 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm6 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $16, 
%ecx +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm7 +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10 +; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11 +; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cvt_16i16_to_16f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm8 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm9 +; AVX512F-NEXT: movswl %ax, %ecx +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm11 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovd %ecx, %xmm12 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm13 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm14 +; AVX512F-NEXT: movswl %ax, %ecx +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm15 +; AVX512F-NEXT: vmovq %xmm10, %rax +; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: movswl %ax, %ecx +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vpextrq $1, %xmm10, %rax +; AVX512F-NEXT: vmovd %ecx, %xmm10 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm5 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: movswl %cx, %ecx 
+; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm7 +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11 +; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: cvt_16i16_to_16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $48, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm8 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm9 +; AVX512VL-NEXT: movswl %ax, %ecx +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm11 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovd %ecx, %xmm12 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $48, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm13 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm14 +; AVX512VL-NEXT: movswl %ax, %ecx +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm15 +; AVX512VL-NEXT: vmovq %xmm10, %rax +; AVX512VL-NEXT: vmovd %ecx, %xmm16 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $48, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm17 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm18 +; AVX512VL-NEXT: movswl %ax, %ecx +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm19 +; 
AVX512VL-NEXT: vpextrq $1, %xmm10, %rax +; AVX512VL-NEXT: vmovd %ecx, %xmm10 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $48, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm20 +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm21 +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm22 +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11 +; AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16 +; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5 +; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7 +; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6 +; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: retq + %1 = bitcast <16 x i16> %a0 to <16 x half> + %2 = fpext <16 x half> %1 to <16 x float> + ret <16 x float> %2 +} + +; +; Half to Float (Load) +; + +define float @load_cvt_i16_to_f32(i16* %a0) nounwind { +; ALL-LABEL: load_cvt_i16_to_f32: +; ALL: # %bb.0: +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: retq + %1 = load i16, i16* %a0 + %2 = bitcast i16 %1 to half + %3 = fpext half %2 to float + ret float %3 +} + +define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { +; AVX1-LABEL: load_cvt_4i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: movswl 6(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: movswl 4(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: movswl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: movswl 2(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_cvt_4i16_to_4f32: +; AVX2: # %bb.0: +; 
AVX2-NEXT: movswl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: movswl 4(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: movswl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cvt_4i16_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movswl 6(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movswl 4(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: movswl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: movswl 2(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cvt_4i16_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswl 6(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: movswl 4(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: movswl 2(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: retq + %1 = load <4 x i16>, <4 x i16>* %a0 + %2 = bitcast <4 x i16> %1 to <4 x half> + %3 = fpext <4 x half> %2 to <4 x float> + ret <4 x float> %3 +} + +define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { +; AVX1-LABEL: load_cvt_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movswl %ax, %esi +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: shrq $48, %rdx +; AVX1-NEXT: movswl %dx, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: movswl %cx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vmovd %esi, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_cvt_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movswl %ax, %esi +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: 
movswl %dx, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: movswl %cx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cvt_8i16_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq (%rdi), %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: movq %rax, %rdx +; AVX512F-NEXT: movswl %ax, %esi +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: shrq $48, %rdx +; AVX512F-NEXT: movswl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movswl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %esi, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cvt_8i16_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq (%rdi), %rax +; AVX512VL-NEXT: movq %rax, %rcx +; AVX512VL-NEXT: movq %rax, %rdx +; AVX512VL-NEXT: movswl %ax, %esi +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: shrq $32, %rcx +; AVX512VL-NEXT: shrq $48, %rdx +; AVX512VL-NEXT: movswl %dx, %edx +; AVX512VL-NEXT: vmovd %edx, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: movswl %cx, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vmovd %esi, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: retq + %1 = load <8 x i16>, <8 x i16>* %a0 + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> + %3 = bitcast <4 x i16> %2 to <4 x half> + %4 = fpext <4 x half> %3 to <4 x float> ret <4 x float> %4 } define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { -; ALL-LABEL: load_cvt_8i16_to_8f32: -; ALL: # %bb.0: -; ALL-NEXT: movswl 6(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl 4(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: movswl 2(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: movswl 14(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: movswl 12(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm5 -; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 -; ALL-NEXT: movswl 8(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm6 -; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 -; 
ALL-NEXT: movswl 10(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm7 -; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 -; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: load_cvt_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: movswl 6(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: movswl 4(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: movswl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: movswl 2(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: movswl 14(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX1-NEXT: movswl 12(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm5 +; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX1-NEXT: movswl 8(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm6 +; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX1-NEXT: movswl 10(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm7 +; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_cvt_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: movswl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: movswl 4(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: movswl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: movswl 14(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm4 +; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX2-NEXT: movswl 12(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX2-NEXT: movswl 8(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: movswl 10(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm7 +; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cvt_8i16_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movswl 6(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movswl 4(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: movswl (%rdi), %eax +; 
AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: movswl 2(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: movswl 14(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512F-NEXT: movswl 12(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm5 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512F-NEXT: movswl 8(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: movswl 10(%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cvt_8i16_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswl 6(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm0 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: movswl 4(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: movswl 2(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: movswl 14(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512VL-NEXT: movswl 12(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm5 +; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512VL-NEXT: movswl 8(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm6 +; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512VL-NEXT: movswl 10(%rdi), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm7 +; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> %3 = fpext <8 x half> %2 to <8 x float> @@ -692,18 +1235,18 @@ ; AVX1-NEXT: movswl 10(%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -758,18 +1301,18 @@ ; AVX2-NEXT: movswl 10(%rdi), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: retq @@ -824,18 +1367,18 @@ ; AVX512F-NEXT: movswl 26(%rdi), %eax ; AVX512F-NEXT: vmovd %eax, %xmm7 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = 
xmm2[0,1],xmm9[0],xmm2[3] +; AVX512F-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 @@ -891,17 +1434,17 @@ ; AVX512VL-NEXT: movswl 26(%rdi), %eax ; AVX512VL-NEXT: vmovd %eax, %xmm7 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -275,7 +275,7 @@ ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 ; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 @@ -711,7 +711,7 @@ ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5 ; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -297,9 +297,8 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1813,12 +1813,12 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[2],zero,xmm0[4],zero,xmm0[6],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -291,9 +291,8 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -306,9 +306,8 @@ ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1744,19 +1744,19 @@ define <16 x i8> 
@shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) { ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; SSE: # %bb.0: -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE-NEXT: psrlq $8, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; AVX1-NEXT: vpsrlq $8, %xmm0, %xmm0 ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; AVX2-SLOW-NEXT: vpsrlq $8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX2-SLOW-NEXT: retq ; @@ -2183,13 +2183,11 @@ ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movzbl (%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -332,19 +332,19 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0124: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0124: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0124: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; @@ -354,11 +354,17 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_0124: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0124: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_0124: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0124: ; AVX512VL: # %bb.0: @@ -371,19 +377,19 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0142: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0142: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0142: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; @@ -419,21 +425,21 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_0412: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_0412: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_0412: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -445,12 +451,19 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_0412: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0412: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_0412: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0412: ; AVX512VL: # %bb.0: @@ -463,21 +476,21 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_4012: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_4012: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_4012: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; 
SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -666,22 +679,22 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_z4zz: ; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_z4zz: ; SSE3: # %bb.0: +; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_z4zz: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -701,26 +714,23 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_zz4z: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_zz4z: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_zz4z: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_zz4z: @@ -815,21 +825,21 @@ ; SSE2-LABEL: shuffle_v4f32_z6zz: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_z6zz: ; SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_z6zz: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -850,7 +860,7 @@ ; SSE2-LABEL: shuffle_v4f32_0z23: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -858,7 +868,7 @@ ; SSE3-LABEL: shuffle_v4f32_0z23: ; 
SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -866,7 +876,7 @@ ; SSSE3-LABEL: shuffle_v4f32_0z23: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -1052,29 +1062,29 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: shuffle_v4f32_0zz4: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0zz4: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE3-NEXT: movaps %xmm2, %xmm0 +; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0zz4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_0zz4: @@ -1136,27 +1146,27 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: shuffle_v4f32_0z24: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0z24: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSE3-NEXT: movaps %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0z24: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSSE3-NEXT: 
movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] ; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -1341,22 +1351,22 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { ; SSE2-LABEL: shuffle_v4i32_z6zz: ; SSE2: # %bb.0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_z6zz: ; SSE3: # %bb.0: +; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4i32_z6zz: ; SSSE3: # %bb.0: +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -1559,13 +1569,13 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_2456: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_2456: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1261,7 +1261,7 @@ ; SSE2-LABEL: shuffle_v8i16_032dXXXX: ; SSE2: # %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -1571,7 +1571,7 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -1579,7 +1579,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX2-FAST-NEXT: retq @@ -1611,21 +1611,21 @@ ; ; SSE41-LABEL: shuffle_v8i16_XX4X8acX: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: 
shuffle_v8i16_XX4X8acX: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i16_XX4X8acX: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: retq @@ -2546,19 +2546,19 @@ define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) { ; SSE-LABEL: shuffle_v8i16_9zzzuuuu: ; SSE: # %bb.0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: retq ; @@ -2569,7 +2569,7 @@ ; ; AVX512VL-SLOW-LABEL: shuffle_v8i16_9zzzuuuu: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX512VL-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3] ; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-SLOW-NEXT: retq ; @@ -2776,7 +2776,7 @@ ; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,0,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; @@ -2854,7 +2854,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,0,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -2255,7 +2255,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15] ; 
AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2263,7 +2263,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq @@ -3229,8 +3229,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3267,8 +3266,8 @@ ; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-NEXT: retq ; @@ -3287,8 +3286,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3327,7 +3325,7 @@ ; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23] ; AVX2-NEXT: retq @@ -3372,7 +3370,7 @@ ; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,u,u,u,u,16,17,16,17,20,21,18,19,24,25,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -3396,7 +3394,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1,2,3,2,3,0,1,12,13,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -3459,7 +3457,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,2,3,0,1,8,9,10,11,6,7,4,5,18,19,16,17,18,19,16,17,24,25,26,27,22,23,20,21] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,12,13,14,15,18,19,16,17,22,23,20,21,22,23,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,u,u,u,u,6,7,4,5,u,u,u,u,18,19,16,17,u,u,u,u,22,23,20,21,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: retq ; @@ -3478,8 +3476,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3507,8 +3504,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3550,17 +3546,17 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; 
AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-FAST-NEXT: retq ; @@ -3743,9 +3739,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23] ; AVX2-NEXT: retq ; @@ -3830,12 +3826,11 @@ ; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -3843,7 +3838,7 @@ ; ; AVX2-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5,u,u,u,u,u,u,u,u,16,17,20,21,24,25,20,21] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -130,7 +130,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] ; AVX1-NEXT: retq ; @@ -369,7 +369,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_08991abb: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] @@ -1430,7 +1430,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] ; AVX1-NEXT: retq ; @@ -1698,7 +1698,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_08991abb: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] @@ -1736,8 +1736,8 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_091b2d3f: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -309,7 +309,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; ALL-NEXT: vbroadcastss %xmm1, %xmm1 ; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -762,20 +762,23 @@ define i32 @mask_zzz3_v16i8(<16 x i8> %a0) { ; SSSE3-LABEL: mask_zzz3_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: mask_zzz3_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] +; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSE41-NEXT: retq ; ; AVX-LABEL: mask_zzz3_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14] +; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll 
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -864,12 +864,19 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test15: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test15: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test15: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -2526,13 +2533,13 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_insertps4: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_insertps4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; @@ -2743,7 +2750,7 @@ ; SSE2-LABEL: PR30264: ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -2751,7 +2758,7 @@ ; SSSE3-LABEL: PR30264: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -117,9 +117,11 @@ ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: andl $3, %ecx +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; @@ -135,8 +137,10 @@ ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vunpcklps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; AVX-NEXT: retq
   %x0 = extractelement <4 x float> %x, i32 %i0
@@ -1292,8 +1296,8 @@
 ; SSE2-NEXT: movd %eax, %xmm2
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
@@ -1329,8 +1333,8 @@
 ; SSSE3-NEXT: movd %eax, %xmm2
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -201,12 +201,16 @@
 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
 ; ALL-NEXT: andl $7, %r9d
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT: movq %rbp, %rsp
@@ -252,12 +256,16 @@
 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT: andl $3, %r9d
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-sqrt.ll b/llvm/test/CodeGen/X86/vector-sqrt.ll
--- a/llvm/test/CodeGen/X86/vector-sqrt.ll
+++ b/llvm/test/CodeGen/X86/vector-sqrt.ll
@@ -33,10 +33,10 @@
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -157,7 +157,7 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -461,9 +461,8 @@
 ; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -622,9 +621,8 @@
 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -1015,7 +1013,7 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -1448,9 +1446,8 @@
 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -1946,7 +1943,7 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2312,9 +2309,8 @@
 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2474,9 +2470,8 @@
 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -2923,9 +2918,8 @@
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -3317,9 +3311,8 @@
 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -3700,9 +3693,8 @@
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -4094,9 +4086,8 @@
 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
@@ -4477,9 +4468,8 @@
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -4871,9 +4861,8 @@
 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -128,12 +128,10 @@
 ;
 ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
 ; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: retq
 ;
 ; AVX512-LABEL: trunc8i64_8i32_lshr:
@@ -353,9 +351,8 @@
 ; AVX1-LABEL: trunc8i32_8i16:
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -2052,13 +2049,14 @@
 ; AVX1-LABEL: store_merge_split:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: shlq $4, %rdi
 ; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -1754,7 +1754,7 @@
 ; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
 ; SSE2: # %bb.0: # %entry
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT: psrlq $8, %xmm1
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -569,7 +569,7 @@
 ; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
 ; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
 ; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pandn %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll
--- a/llvm/test/CodeGen/X86/vshift-4.ll
+++ b/llvm/test/CodeGen/X86/vshift-4.ll
@@ -58,19 +58,15 @@
 ; X32-LABEL: shift2a:
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shift2a:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
 ; X64-NEXT: movdqa %xmm0, (%rdi)
 ; X64-NEXT: retq
 entry:
@@ -84,19 +80,15 @@
 ; X32-LABEL: shift2b:
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shift2b:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
 ; X64-NEXT: movdqa %xmm0, (%rdi)
 ; X64-NEXT: retq
 entry:
@@ -110,19 +102,15 @@
 ; X32-LABEL: shift2c:
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-NEXT: pslld %xmm2, %xmm0
+; X32-NEXT: psrlq $32, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shift2c:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: pslld %xmm2, %xmm0
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
 ; X64-NEXT: movdqa %xmm0, (%rdi)
 ; X64-NEXT: retq
 entry:
@@ -136,8 +124,7 @@
 ; X32-LABEL: shift3a:
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; X32-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X32-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
 ; X32-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-NEXT: psllw %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
@@ -145,8 +132,7 @@
 ;
 ; X64-LABEL: shift3a:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
 ; X64-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NEXT: psllw %xmm1, %xmm0
 ; X64-NEXT: movdqa %xmm0, (%rdi)