Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1340,6 +1340,74 @@
     KnownUndef.setHighBits(NumElts - 1);
     break;
   }
+  case ISD::BITCAST: {
+    SDValue Src = Op.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // We only handle vectors here.
+    // TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
+    if (!SrcVT.isVector())
+      break;
+
+    // Fast handling of 'identity' bitcasts.
+    unsigned NumSrcElts = SrcVT.getVectorNumElements();
+    if (NumSrcElts == NumElts)
+      return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
+                                        KnownZero, TLO, Depth + 1);
+
+    APInt SrcZero, SrcUndef;
+    APInt SrcDemandedElts = APInt::getNullValue(NumSrcElts);
+
+    // Bitcast from 'large element' src vector to 'small element' vector - we
+    // must demand a source element if any DemandedElt maps to it.
+    if ((NumElts % NumSrcElts) == 0) {
+      unsigned Scale = NumElts / NumSrcElts;
+      for (unsigned i = 0; i != NumElts; ++i)
+        if (DemandedElts[i])
+          SrcDemandedElts.setBit(i / Scale);
+
+      if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+                                     TLO, Depth + 1))
+        return true;
+
+      // If the src element is zero/undef then all the output elements it maps
+      // to will be - only demanded elements are guaranteed to be correct.
+      for (unsigned i = 0; i != NumSrcElts; ++i) {
+        if (SrcDemandedElts[i]) {
+          if (SrcZero[i])
+            KnownZero.setBits(i * Scale, (i + 1) * Scale);
+          if (SrcUndef[i])
+            KnownUndef.setBits(i * Scale, (i + 1) * Scale);
+        }
+      }
+    }
+
+    // Bitcast from 'small element' src vector to 'large element' vector - we
+    // demand all smaller source elements covered by the larger demanded element
+    // of this vector.
+    if ((NumSrcElts % NumElts) == 0) {
+      unsigned Scale = NumSrcElts / NumElts;
+      for (unsigned i = 0; i != NumElts; ++i)
+        if (DemandedElts[i])
+          SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);
+
+      if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+                                     TLO, Depth + 1))
+        return true;
+
+      // If all the src elements covering an output element are zero/undef, then
+      // the output element will be as well, assuming it was demanded.
+      for (unsigned i = 0; i != NumElts; ++i) {
+        if (DemandedElts[i]) {
+          if (SrcZero.extractBits(Scale, i * Scale).isAllOnesValue())
+            KnownZero.setBit(i);
+          if (SrcUndef.extractBits(Scale, i * Scale).isAllOnesValue())
+            KnownUndef.setBit(i);
+        }
+      }
+    }
+    break;
+  }
   case ISD::BUILD_VECTOR: {
     // Check all elements and simplify any unused elements with UNDEF.
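The core of the new ISD::BITCAST handling above is how the demanded-elements mask of the result vector is scaled onto the source vector. Below is a minimal standalone sketch of that mapping, using a plain uint64_t mask in place of APInt; the helper name and the main() driver are illustrative assumptions, not part of the patch.

// Sketch only: mirrors the two scaling cases in the BITCAST handling above.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Map a demanded-elements mask of a NumElts-wide result vector onto the
// NumSrcElts-wide source vector of a bitcast.
static uint64_t scaleDemandedElts(uint64_t DemandedElts, unsigned NumElts,
                                  unsigned NumSrcElts) {
  uint64_t SrcDemandedElts = 0;
  if ((NumElts % NumSrcElts) == 0) {
    // 'Large element' source: each source element feeds Scale result
    // elements, so demand a source element if any result element maps to it.
    unsigned Scale = NumElts / NumSrcElts;
    for (unsigned i = 0; i != NumElts; ++i)
      if (DemandedElts & (1ULL << i))
        SrcDemandedElts |= 1ULL << (i / Scale);
  } else if ((NumSrcElts % NumElts) == 0) {
    // 'Small element' source: each result element covers Scale source
    // elements, so demand all of them.
    unsigned Scale = NumSrcElts / NumElts;
    for (unsigned i = 0; i != NumElts; ++i)
      if (DemandedElts & (1ULL << i))
        for (unsigned j = 0; j != Scale; ++j)
          SrcDemandedElts |= 1ULL << (i * Scale + j);
  }
  return SrcDemandedElts;
}

int main() {
  // v2i64 -> v4i32: demanding v4i32 element 2 demands v2i64 element 1.
  assert(scaleDemandedElts(0x4, 4, 2) == 0x2);
  // v8i16 -> v4i32: demanding v4i32 element 1 demands v8i16 elements 2 and 3.
  assert(scaleDemandedElts(0x2, 4, 8) == 0xC);
  std::puts("scaleDemandedElts sketch ok");
  return 0;
}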
if (!DemandedElts.isAllOnesValue()) { Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -2040,16 +2040,17 @@ ; SSE2-NEXT: movaps (%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp @@ -2061,168 +2062,163 @@ ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: addq %rax, %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rdi, %rax -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: addq %r15, %r14 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: addq %rax, %rbp ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: addq %rsi, %rax -; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: addq %rdx, %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: addq %r13, %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %r10, %rax -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: addq %r13, %r10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: addq %rcx, %r12 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE2-NEXT: addq %rdx, %r11 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: addq %rdi, %rax ; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: addq %r9, %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rbx, %rcx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %r12, %rax -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload -; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq %r14, %r13 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: addq %r9, %r15 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: addq %r8, %r14 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE2-NEXT: addq %rbx, %r8 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %r9 # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload -; SSE2-NEXT: movq 
%rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movq %rax, %rbx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload ; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload ; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: addq $-1, %r11 -; SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %r12d -; SSE2-NEXT: adcq $-1, %r12 -; SSE2-NEXT: addq $-1, %rdi -; SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: adcq $-1, %rdx -; SSE2-NEXT: addq $-1, %r14 -; SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload +; SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movl $0, %edi ; SSE2-NEXT: adcq $-1, %rdi -; SSE2-NEXT: addq $-1, %r15 -; SSE2-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: adcq $-1, %rax ; SSE2-NEXT: addq $-1, %rsi -; SSE2-NEXT: movq %rsi, (%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %r15d -; SSE2-NEXT: adcq $-1, %r15 -; SSE2-NEXT: addq $-1, %r8 -; SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %r14d -; SSE2-NEXT: adcq $-1, %r14 -; SSE2-NEXT: addq $-1, %r10 -; SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movl $0, %esi ; SSE2-NEXT: adcq $-1, %rsi -; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload ; SSE2-NEXT: addq $-1, %r10 -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: adcq $-1, %rsi -; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movl $0, %r10d +; SSE2-NEXT: adcq $-1, %r10 +; SSE2-NEXT: addq $-1, %r12 +; SSE2-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movl $0, %ecx +; SSE2-NEXT: adcq $-1, %rcx +; SSE2-NEXT: addq $-1, %r11 +; SSE2-NEXT: movq %r11, (%rsp) # 8-byte Spill +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: adcq $-1, %r11 +; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %r12d +; SSE2-NEXT: adcq $-1, %r12 ; SSE2-NEXT: addq $-1, %r13 -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: adcq $-1, %rsi -; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: addq $-1, %rcx -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: adcq $-1, %rsi +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r15 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r14 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %r8 +; SSE2-NEXT: movl $0, %ebp +; SSE2-NEXT: adcq $-1, %rbp ; SSE2-NEXT: addq $-1, %r9 ; SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movl 
$0, %r9d ; SSE2-NEXT: adcq $-1, %r9 -; SSE2-NEXT: addq $-1, %rbp -; SSE2-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill -; SSE2-NEXT: movl $0, %r11d -; SSE2-NEXT: adcq $-1, %r11 -; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill +; SSE2-NEXT: addq $-1, %rbx +; SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: movl $0, %ebx ; SSE2-NEXT: adcq $-1, %rbx -; SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill -; SSE2-NEXT: movl $0, %ebp -; SSE2-NEXT: adcq $-1, %rbp +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill -; SSE2-NEXT: movl $0, %ebx -; SSE2-NEXT: adcq $-1, %rbx -; SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax ; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill ; SSE2-NEXT: adcq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill -; SSE2-NEXT: shldq $63, %rcx, %rsi -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload -; SSE2-NEXT: shldq $63, %r13, %rbx -; SSE2-NEXT: shldq $63, %r10, %r8 -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %r10 -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %r14 -; SSE2-NEXT: movq (%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %r15 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %rdi -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shldq $63, %rcx, %r12 -; SSE2-NEXT: movq %r12, %xmm11 -; SSE2-NEXT: movq %rdx, %xmm5 -; SSE2-NEXT: movq %rdi, %xmm13 -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shrdq $1, %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm15 -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm8 -; SSE2-NEXT: movq %r15, %xmm9 -; SSE2-NEXT: movq %r14, %xmm6 -; SSE2-NEXT: movq %r10, %xmm7 -; SSE2-NEXT: movq %r8, %xmm0 +; SSE2-NEXT: shldq $63, %rdx, %rax +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: shldq $63, %r8, %rbp +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload +; SSE2-NEXT: shldq $63, %r14, %r8 +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload +; SSE2-NEXT: shldq $63, %r15, %r14 +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload +; SSE2-NEXT: shldq $63, %r13, %r15 +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %r12 +; SSE2-NEXT: movq (%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %rdi +; SSE2-NEXT: movq %rdi, %xmm11 +; SSE2-NEXT: movq %rsi, %xmm5 +; SSE2-NEXT: movq %r10, %xmm13 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: shrdq $1, %rcx, %rax +; SSE2-NEXT: movq %rax, %xmm15 +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rcx, %xmm8 +; SSE2-NEXT: movq %r11, %xmm9 +; SSE2-NEXT: movq %r12, %xmm6 +; 
SSE2-NEXT: movq %r15, %xmm7 +; SSE2-NEXT: movq %r14, %xmm0 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movq %rbx, %xmm10 -; SSE2-NEXT: movq %rsi, %xmm4 +; SSE2-NEXT: movq %r8, %xmm10 +; SSE2-NEXT: movq %rbp, %xmm4 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload ; SSE2-NEXT: shrdq $1, %r9, %rax ; SSE2-NEXT: movq %rax, %xmm1 ; SSE2-NEXT: shrq %r9 ; SSE2-NEXT: movq %r9, %xmm12 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload -; SSE2-NEXT: shrdq $1, %r11, %rax +; SSE2-NEXT: shrdq $1, %rbx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm14 +; SSE2-NEXT: shrq %rbx +; SSE2-NEXT: movq %rbx, %xmm14 ; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload ; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload ; SSE2-NEXT: shrdq $1, %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rcx, %rbp +; SSE2-NEXT: shrq %rbp ; SSE2-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0] ; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm11, %xmm5 -; SSE2-NEXT: movq %rax, %xmm11 +; SSE2-NEXT: movq %rbp, %xmm11 ; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; SSE2-NEXT: movq %rbp, %rcx -; SSE2-NEXT: shrdq $1, %rbp, %rax +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; SSE2-NEXT: shrdq $1, %rcx, %rax ; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] @@ -2236,17 +2232,16 @@ ; SSE2-NEXT: pand %xmm13, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm13 ; SSE2-NEXT: movq %rcx, %xmm15 -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shrdq $1, %rcx, %rax ; SSE2-NEXT: por %xmm0, %xmm13 ; SSE2-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] ; SSE2-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] ; SSE2-NEXT: pand %xmm0, %xmm6 ; SSE2-NEXT: pandn %xmm9, %xmm0 -; SSE2-NEXT: movq %rax, %xmm9 -; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: movq %rdx, %xmm9 +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload +; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; SSE2-NEXT: shrdq $1, %rax, %rcx ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6] ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] @@ -2255,31 +2250,27 @@ ; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pandn %xmm7, %xmm6 ; SSE2-NEXT: movq %rcx, %xmm7 -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; SSE2-NEXT: shrdq $1, %rax, %rcx +; SSE2-NEXT: shrq %rax ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: por 
%xmm6, %xmm5 -; SSE2-NEXT: movq %rax, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE2-NEXT: pslld $24, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] ; SSE2-NEXT: psllq $56, %xmm3 @@ -2290,18 +2281,17 @@ ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm7[0] -; SSE2-NEXT: psllq $40, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE2-NEXT: psllq $40, %xmm7 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE2-NEXT: movupd %xmm5, (%rax) Index: test/CodeGen/X86/mmx-build-vector.ll =================================================================== --- test/CodeGen/X86/mmx-build-vector.ll +++ test/CodeGen/X86/mmx-build-vector.ll @@ -637,42 +637,28 @@ ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: build_v4i16_012u: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movd %ecx, %xmm0 -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_012u: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd 
%edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: movd %ecx, %xmm0 -; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X64-SSE-LABEL: build_v4i16_012u: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: movd %esi, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 +; X64-SSE-NEXT: paddd %mm0, %mm0 +; X64-SSE-NEXT: movq %mm0, (%rdi) +; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: build_v4i16_012u: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovd %esi, %xmm0 ; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X64-AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 @@ -762,7 +748,7 @@ ; X64-SSSE3-LABEL: build_v4i16_0u00: ; X64-SSSE3: # %bb.0: ; X64-SSSE3-NEXT: movd %esi, %xmm0 -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,2,3] +; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] ; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-SSSE3-NEXT: paddd %mm0, %mm0 @@ -772,7 +758,7 @@ ; X64-AVX1-LABEL: build_v4i16_0u00: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,2,3] +; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] ; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX1-NEXT: paddd %mm0, %mm0 @@ -783,7 +769,8 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %esi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X64-AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX2-NEXT: paddd %mm0, %mm0 @@ -794,7 +781,8 @@ ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovd %esi, %xmm0 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX512-NEXT: paddd %mm0, %mm0 @@ -1029,7 +1017,7 @@ ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] +; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] ; X86-SSSE3-NEXT: movq %xmm0, (%esp) ; X86-SSSE3-NEXT: movq (%esp), %mm0 ; X86-SSSE3-NEXT: paddd %mm0, %mm0 @@ -1076,7 +1064,7 @@ ; X64-SSSE3-NEXT: movd %esi, %xmm1 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u] +; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u] ; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-SSSE3-NEXT: paddd %mm0, %mm0 @@ -1094,7 +1082,7 @@ ; X64-AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; X64-AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u] ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 @@ -1186,7 +1174,7 @@ ; X86-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0 ; X86-SSSE3-NEXT: movl 24(%ebp), %ecx ; X86-SSSE3-NEXT: pinsrw $3, %ecx, %xmm0 -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; X86-SSSE3-NEXT: movq %xmm0, (%esp) ; X86-SSSE3-NEXT: movq (%esp), %mm0 ; X86-SSSE3-NEXT: paddd %mm0, %mm0 @@ -1217,7 +1205,7 @@ ; X64-SSSE3-NEXT: pinsrw $1, %edx, %xmm0 ; X64-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0 ; X64-SSSE3-NEXT: pinsrw $3, %r8d, %xmm0 -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-SSSE3-NEXT: paddd %mm0, %mm0 @@ -1231,7 +1219,7 @@ ; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-AVX-NEXT: paddd %mm0, %mm0 Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -325,7 +325,7 @@ ; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi) ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdi) ; SSE42-NEXT: movd %xmm1, (%rdi) ; SSE42-NEXT: retq @@ -335,7 +335,7 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi) ; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -343,7 +343,7 @@ ; ; XOP-LABEL: v7i8: ; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,14,u,u,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,u,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi) ; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi) ; XOP-NEXT: vmovd %xmm0, (%rdi) Index: test/CodeGen/X86/vec_insert-7.ll =================================================================== --- test/CodeGen/X86/vec_insert-7.ll +++ test/CodeGen/X86/vec_insert-7.ll @@ -18,13 +18,8 @@ ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X64-NEXT: movl $32, %eax -; X64-NEXT: pinsrq $0, %rax, %xmm1 -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: retq %tmp = bitcast x86_mmx %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -844,18 +844,17 @@ ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04] ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE-NEXT: mulpd %xmm2, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm4 +; SSE-NEXT: mulpd %xmm2, %xmm4 +; SSE-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: uitofp_4i32_to_4f64: @@ -2927,7 +2926,7 @@ ; SSE-LABEL: uitofp_load_2i32_to_2f64: ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE-NEXT: psrld $16, %xmm0 @@ -2940,7 +2939,7 @@ ; VEX: # %bb.0: ; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 ; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 ; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 @@ -3129,18 +3128,17 @@ ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE-NEXT: movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04] ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm3, 
%xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE-NEXT: mulpd %xmm2, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm4 +; SSE-NEXT: mulpd %xmm2, %xmm4 +; SSE-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i32_to_4f64: Index: test/CodeGen/X86/vector-half-conversions.ll =================================================================== --- test/CodeGen/X86/vector-half-conversions.ll +++ test/CodeGen/X86/vector-half-conversions.ll @@ -2115,7 +2115,38 @@ } define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { -; AVX1-LABEL: cvt_4f32_to_8i16_undef: +; ALL-LABEL: cvt_4f32_to_8i16_undef: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %eax, %edx +; ALL-NEXT: shlq $32, %rdx +; ALL-NEXT: orq %rcx, %rdx +; ALL-NEXT: vmovq %rdx, %xmm0 +; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; ALL-NEXT: retq + %1 = fptrunc <4 x float> %a0 to <4 x half> + %2 = bitcast <4 x half> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> + ret <8 x i16> %3 +} + +define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { +; AVX1-LABEL: cvt_4f32_to_8i16_zero: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2138,34 +2169,61 @@ ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: retq ; -; AVX2-LABEL: cvt_4f32_to_8i16_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %eax +; AVX2-SLOW-NEXT: shll $16, %eax +; AVX2-SLOW-NEXT: 
vcvtps2ph $4, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX2-SLOW-NEXT: movzwl %cx, %ecx +; AVX2-SLOW-NEXT: orl %eax, %ecx +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %eax +; AVX2-SLOW-NEXT: shll $16, %eax +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %edx +; AVX2-SLOW-NEXT: movzwl %dx, %edx +; AVX2-SLOW-NEXT: orl %eax, %edx +; AVX2-SLOW-NEXT: shlq $32, %rdx +; AVX2-SLOW-NEXT: orq %rcx, %rdx +; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %eax +; AVX2-FAST-NEXT: shll $16, %eax +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %ecx +; AVX2-FAST-NEXT: movzwl %cx, %ecx +; AVX2-FAST-NEXT: orl %eax, %ecx +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %eax +; AVX2-FAST-NEXT: shll $16, %eax +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %edx +; AVX2-FAST-NEXT: movzwl %dx, %edx +; AVX2-FAST-NEXT: orl %eax, %edx +; AVX2-FAST-NEXT: shlq $32, %rdx +; AVX2-FAST-NEXT: orq %rcx, %rdx +; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: cvt_4f32_to_8i16_undef: +; AVX512F-LABEL: cvt_4f32_to_8i16_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2188,9 +2246,10 @@ ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: cvt_4f32_to_8i16_undef: +; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2212,41 +2271,10 @@ ; AVX512VL-NEXT: shlq $32, %rdx ; AVX512VL-NEXT: orq %rcx, %rdx ; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> - %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> - ret <8 x i16> %3 -} - -define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { -; ALL-LABEL: cvt_4f32_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: shll $16, %eax -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; ALL-NEXT: vmovd %xmm1, %ecx -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: orl %eax, %ecx -; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: shll $16, %eax -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtps2ph $4, 
%xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %edx -; ALL-NEXT: movzwl %dx, %edx -; ALL-NEXT: orl %eax, %edx -; ALL-NEXT: shlq $32, %rdx -; ALL-NEXT: orq %rcx, %rdx -; ALL-NEXT: vmovq %rdx, %xmm0 -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; ALL-NEXT: retq - %1 = fptrunc <4 x float> %a0 to <4 x half> - %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %3 } @@ -2553,7 +2581,40 @@ } define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f32_to_8i16_undef: +; ALL-LABEL: store_cvt_4f32_to_8i16_undef: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %eax, %edx +; ALL-NEXT: shlq $32, %rdx +; ALL-NEXT: orq %rcx, %rdx +; ALL-NEXT: vmovq %rdx, %xmm0 +; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; ALL-NEXT: vmovdqa %xmm0, (%rdi) +; ALL-NEXT: retq + %1 = fptrunc <4 x float> %a0 to <4 x half> + %2 = bitcast <4 x half> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> + store <8 x i16> %3, <8 x i16>* %a1 + ret void +} + +define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { +; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2576,36 +2637,64 @@ ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; -; AVX2-LABEL: store_cvt_4f32_to_8i16_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %eax +; AVX2-SLOW-NEXT: shll $16, %eax +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX2-SLOW-NEXT: movzwl %cx, %ecx +; AVX2-SLOW-NEXT: orl %eax, %ecx +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovd %xmm1, %eax +; AVX2-SLOW-NEXT: shll $16, %eax +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %edx +; AVX2-SLOW-NEXT: movzwl %dx, %edx +; AVX2-SLOW-NEXT: orl %eax, %edx +; AVX2-SLOW-NEXT: shlq $32, %rdx +; AVX2-SLOW-NEXT: orq %rcx, %rdx +; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef: +; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %eax +; AVX2-FAST-NEXT: shll $16, %eax +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %ecx +; AVX2-FAST-NEXT: movzwl %cx, %ecx +; AVX2-FAST-NEXT: orl %eax, %ecx +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovd %xmm1, %eax +; AVX2-FAST-NEXT: shll $16, %eax +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %edx +; AVX2-FAST-NEXT: movzwl %dx, %edx +; AVX2-FAST-NEXT: orl %eax, %edx +; AVX2-FAST-NEXT: shlq $32, %rdx +; AVX2-FAST-NEXT: orq %rcx, %rdx +; AVX2-FAST-NEXT: vmovq %rdx, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2628,10 +2717,11 @@ ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef: +; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -2653,44 +2743,11 @@ ; AVX512VL-NEXT: shlq $32, %rdx ; AVX512VL-NEXT: orq %rcx, %rdx ; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) ; AVX512VL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> - %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> - store <8 x i16> %3, <8 x i16>* %a1 - ret void -} - -define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f32_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: shll $16, %eax -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; ALL-NEXT: vmovd %xmm1, %ecx -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: orl %eax, %ecx -; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: shll $16, %eax -; ALL-NEXT: 
vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %edx -; ALL-NEXT: movzwl %dx, %edx -; ALL-NEXT: orl %eax, %edx -; ALL-NEXT: shlq $32, %rdx -; ALL-NEXT: orq %rcx, %rdx -; ALL-NEXT: vmovq %rdx, %xmm0 -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; ALL-NEXT: vmovdqa %xmm0, (%rdi) -; ALL-NEXT: retq - %1 = fptrunc <4 x float> %a0 to <4 x half> - %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> store <8 x i16> %3, <8 x i16>* %a1 ret void @@ -3175,7 +3232,167 @@ ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_4f64_to_8i16_undef: +; AVX512-LABEL: cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movzwl %ax, %r14d +; AVX512-NEXT: orl %ebx, %r14d +; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: orl %ebx, %eax +; AVX512-NEXT: shlq $32, %rax +; AVX512-NEXT: orq %r14, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: retq + %1 = fptrunc <4 x double> %a0 to <4 x half> + %2 = bitcast <4 x half> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> + ret <8 x i16> %3 +} + +define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { +; AVX1-LABEL: cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: shll $16, %ebx +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movzwl %ax, %r14d +; AVX1-NEXT: orl %ebx, %r14d +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: shll $16, %ebx +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: orl %ebx, %eax +; AVX1-NEXT: shlq $32, %rax +; AVX1-NEXT: orq %r14, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: pushq %r14 +; AVX2-SLOW-NEXT: pushq %rbx +; AVX2-SLOW-NEXT: subq $40, %rsp +; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movl %eax, %ebx +; AVX2-SLOW-NEXT: shll $16, %ebx +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movzwl %ax, %r14d +; AVX2-SLOW-NEXT: orl %ebx, %r14d +; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movl %eax, %ebx +; AVX2-SLOW-NEXT: shll $16, %ebx +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movzwl %ax, %eax +; AVX2-SLOW-NEXT: orl %ebx, %eax +; AVX2-SLOW-NEXT: shlq $32, %rax +; AVX2-SLOW-NEXT: orq %r14, %rax +; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: addq $40, %rsp +; AVX2-SLOW-NEXT: popq %rbx +; AVX2-SLOW-NEXT: popq %r14 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: pushq %r14 +; AVX2-FAST-NEXT: pushq %rbx +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movl %eax, %ebx +; AVX2-FAST-NEXT: shll $16, %ebx +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movzwl %ax, %r14d +; AVX2-FAST-NEXT: orl %ebx, %r14d +; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movl %eax, %ebx +; AVX2-FAST-NEXT: shll $16, %ebx +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movzwl %ax, %eax +; AVX2-FAST-NEXT: orl %ebx, %eax +; AVX2-FAST-NEXT: shlq $32, %rax +; AVX2-FAST-NEXT: orq %r14, %rax +; AVX2-FAST-NEXT: vmovq %rax, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: addq $40, %rsp +; AVX2-FAST-NEXT: popq %rbx +; AVX2-FAST-NEXT: popq %r14 +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: cvt_4f64_to_8i16_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %r14 ; AVX512F-NEXT: pushq %rbx @@ -3208,12 +3425,13 @@ ; AVX512F-NEXT: orq %r14, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovq 
{{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: addq $40, %rsp ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r14 ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: cvt_4f64_to_8i16_undef: +; AVX512VL-LABEL: cvt_4f64_to_8i16_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: pushq %r14 ; AVX512VL-NEXT: pushq %rbx @@ -3245,152 +3463,32 @@ ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %r14, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: addq $40, %rsp ; AVX512VL-NEXT: popq %rbx ; AVX512VL-NEXT: popq %r14 ; AVX512VL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> - %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> + %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %3 } -define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { -; AVX1-LABEL: cvt_4f64_to_8i16_zero: +define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { +; AVX1-LABEL: cvt_8f64_to_8i16: ; AVX1: # %bb.0: +; AVX1-NEXT: pushq %r15 ; AVX1-NEXT: pushq %r14 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $40, %rsp -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r14d -; AVX1-NEXT: orl %ebx, %r14d -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %ebx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %r14, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: addq $40, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f64_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $40, %rsp -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r14d -; AVX2-NEXT: orl %ebx, %r14d -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; 
AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %ebx, %eax -; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %r14, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: addq $40, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: retq -; -; AVX512-LABEL: cvt_4f64_to_8i16_zero: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $40, %rsp -; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r14d -; AVX512-NEXT: orl %ebx, %r14d -; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: orl %ebx, %eax -; AVX512-NEXT: shlq $32, %rax -; AVX512-NEXT: orq %r14, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: addq $40, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq - %1 = fptrunc <4 x double> %a0 to <4 x half> - %2 = bitcast <4 x half> %1 to <4 x i16> - %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> - ret <8 x i16> %3 -} - -define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { -; AVX1-LABEL: cvt_8f64_to_8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT @@ -3833,7 +3931,184 @@ ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512F-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $32, %rsp +; AVX512-NEXT: movq %rdi, %r14 +; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: 
movzwl %ax, %ebx +; AVX512-NEXT: orl %ebp, %ebx +; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: orl %ebp, %eax +; AVX512-NEXT: shlq $32, %rax +; AVX512-NEXT: orq %rbx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa %xmm0, (%r14) +; AVX512-NEXT: addq $32, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %1 = fptrunc <4 x double> %a0 to <4 x half> + %2 = bitcast <4 x half> %1 to <4 x i16> + %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> + store <8 x i16> %3, <8 x i16>* %a1 + ret void +} + +define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { +; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: movq %rdi, %r14 +; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: shll $16, %ebp +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movzwl %ax, %ebx +; AVX1-NEXT: orl %ebp, %ebx +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: shll $16, %ebp +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: orl %ebp, %eax +; AVX1-NEXT: shlq $32, %rax +; AVX1-NEXT: orq %rbx, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vmovdqa %xmm0, (%r14) +; AVX1-NEXT: addq $32, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: pushq %rbp +; AVX2-SLOW-NEXT: pushq %r14 +; AVX2-SLOW-NEXT: pushq %rbx +; AVX2-SLOW-NEXT: subq $32, %rsp +; AVX2-SLOW-NEXT: movq %rdi, %r14 +; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movl %eax, %ebp +; AVX2-SLOW-NEXT: shll $16, %ebp +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movzwl %ax, %ebx +; AVX2-SLOW-NEXT: orl %ebp, %ebx +; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; 
AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movl %eax, %ebp +; AVX2-SLOW-NEXT: shll $16, %ebp +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: callq __truncdfhf2@PLT +; AVX2-SLOW-NEXT: movzwl %ax, %eax +; AVX2-SLOW-NEXT: orl %ebp, %eax +; AVX2-SLOW-NEXT: shlq $32, %rax +; AVX2-SLOW-NEXT: orq %rbx, %rax +; AVX2-SLOW-NEXT: vmovq %rax, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r14) +; AVX2-SLOW-NEXT: addq $32, %rsp +; AVX2-SLOW-NEXT: popq %rbx +; AVX2-SLOW-NEXT: popq %r14 +; AVX2-SLOW-NEXT: popq %rbp +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: pushq %rbp +; AVX2-FAST-NEXT: pushq %r14 +; AVX2-FAST-NEXT: pushq %rbx +; AVX2-FAST-NEXT: subq $32, %rsp +; AVX2-FAST-NEXT: movq %rdi, %r14 +; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movl %eax, %ebp +; AVX2-FAST-NEXT: shll $16, %ebp +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movzwl %ax, %ebx +; AVX2-FAST-NEXT: orl %ebp, %ebx +; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movl %eax, %ebp +; AVX2-FAST-NEXT: shll $16, %ebp +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: callq __truncdfhf2@PLT +; AVX2-FAST-NEXT: movzwl %ax, %eax +; AVX2-FAST-NEXT: orl %ebp, %eax +; AVX2-FAST-NEXT: shlq $32, %rax +; AVX2-FAST-NEXT: orq %rbx, %rax +; AVX2-FAST-NEXT: vmovq %rax, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r14) +; AVX2-FAST-NEXT: addq $32, %rsp +; AVX2-FAST-NEXT: popq %rbx +; AVX2-FAST-NEXT: popq %r14 +; AVX2-FAST-NEXT: popq %rbp +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: pushq %r14 @@ -3868,6 +4143,7 @@ ; AVX512F-NEXT: orq %rbx, %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vmovdqa %xmm0, (%r14) ; AVX512F-NEXT: addq $32, %rsp ; AVX512F-NEXT: popq %rbx @@ -3875,7 +4151,7 @@ ; AVX512F-NEXT: popq %rbp ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: pushq %rbp ; AVX512VL-NEXT: pushq %r14 @@ -3909,7 +4185,7 @@ ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %rbx, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vmovdqa %xmm0, (%r14) ; AVX512VL-NEXT: addq $32, %rsp ; AVX512VL-NEXT: popq %rbx @@ -3918,139 +4194,6 @@ ; 
AVX512VL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> - %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> - store <8 x i16> %3, <8 x i16>* %a1 - ret void -} - -define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $32, %rsp -; AVX1-NEXT: movq %rdi, %r14 -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: shll $16, %ebp -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %ebx -; AVX1-NEXT: orl %ebp, %ebx -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: shll $16, %ebp -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %ebp, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rbx, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, (%r14) -; AVX1-NEXT: addq $32, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $32, %rsp -; AVX2-NEXT: movq %rdi, %r14 -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: shll $16, %ebp -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %ebx -; AVX2-NEXT: orl %ebp, %ebx -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: shll $16, %ebp -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %ebp, %eax -; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa %xmm0, (%r14) -; AVX2-NEXT: addq $32, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: retq -; -; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $32, %rsp -; AVX512-NEXT: movq %rdi, %r14 -; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = 
xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: shll $16, %ebp -; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %ebx -; AVX512-NEXT: orl %ebp, %ebx -; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: shll $16, %ebp -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: orl %ebp, %eax -; AVX512-NEXT: shlq $32, %rax -; AVX512-NEXT: orq %rbx, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa %xmm0, (%r14) -; AVX512-NEXT: addq $32, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq - %1 = fptrunc <4 x double> %a0 to <4 x half> - %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> store <8 x i16> %3, <8 x i16>* %a1 ret void Index: test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v4.ll +++ test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1863,7 +1863,7 @@ ; ; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: retq %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> Index: test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v8.ll +++ test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2536,33 +2536,20 @@ } define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { -; SSE2-LABEL: insert_dup_mem_v8i16_sext_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movswl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movswl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_mem_v8i16_sext_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movswl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_mem_v8i16_sext_i16: +; SSE: # %bb.0: +; SSE-NEXT: movswl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16: ; AVX1: # %bb.0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -2615,8 +2602,7 @@ ; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,0,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; @@ -2650,33 +2636,20 @@ } define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) { -; SSE2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movswl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movswl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movswl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: +; SSE: # %bb.0: +; SSE-NEXT: movswl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: ; AVX1: # %bb.0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -2706,8 +2679,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,0,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: retq ; Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4604,7 +4604,8 @@ ; AVX1-LABEL: insert_dup_mem_v16i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -4624,7 +4625,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -4652,7 +4654,8 @@ ; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ;