Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -118,6 +118,8 @@
   // registers. e.g. r1 = move v1024.
   DenseMap<Register, Register> DstRegMap;
 
+  void removeClobberedSrcRegMap(MachineInstr *MI);
+
   bool isRevCopyChain(Register FromReg, Register ToReg, int Maxlen);
 
   bool noUseAfterLastDef(Register Reg, unsigned Dist, unsigned &LastDef);
@@ -380,7 +382,8 @@
   if (!MRI->hasOneNonDBGUse(Reg))
     // None or more than one use.
     return nullptr;
-  MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(Reg);
+  MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
+  MachineInstr &UseMI = *UseOp.getParent();
   if (UseMI.getParent() != MBB)
     return nullptr;
   Register SrcReg;
@@ -394,6 +397,18 @@
     IsDstPhys = DstReg.isPhysical();
     return &UseMI;
   }
+  if (UseMI.isCommutable()) {
+    unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
+    unsigned Src2 = UseMI.getOperandNo(&UseOp);
+    if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
+      MachineOperand &MO = UseMI.getOperand(Src1);
+      if (MO.isReg() && MO.isUse() &&
+          isTwoAddrUse(UseMI, MO.getReg(), DstReg)) {
+        IsDstPhys = DstReg.isPhysical();
+        return &UseMI;
+      }
+    }
+  }
   return nullptr;
 }
 
@@ -422,6 +437,76 @@
   return TRI->regsOverlap(RegA, RegB);
 }
 
+/// From RegMap remove entries mapped to a physical register which overlaps MO.
+static void removeMapRegEntry(const MachineOperand &MO,
+                              DenseMap<Register, Register> &RegMap,
+                              const TargetRegisterInfo *TRI) {
+  assert(
+      (MO.isReg() || MO.isRegMask()) &&
+      "removeMapRegEntry must be called with a register or regmask operand.");
+
+  SmallVector<Register, 2> Srcs;
+  for (auto SI : RegMap) {
+    Register ToReg = SI.second;
+    if (ToReg.isVirtual())
+      continue;
+
+    if (MO.isReg()) {
+      Register Reg = MO.getReg();
+      if (TRI->regsOverlap(ToReg, Reg))
+        Srcs.push_back(SI.first);
+    } else if (MO.clobbersPhysReg(ToReg))
+      Srcs.push_back(SI.first);
+  }
+
+  for (auto SrcReg : Srcs)
+    RegMap.erase(SrcReg);
+}
+
+/// If a physical register is clobbered, old entries mapped to it should be
+/// deleted. For example
+///
+///   %2:gr64 = COPY killed $rdx
+///   MUL64r %3:gr64, implicit-def $rax, implicit-def $rdx
+///
+/// After the MUL instruction, $rdx contains a different value than in the
+/// COPY instruction. So %2 should not map to $rdx after MUL.
+void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) {
+  if (MI->isCopy()) {
+    // If a virtual register is copied to its mapped physical register, it
+    // doesn't change the potential coalescing between them, so we don't remove
+    // entries mapped to the physical register. For example
+    //
+    //   %100 = COPY $r8
+    //   ...
+    //   $r8 = COPY %100
+    //
+    // The first copy constructs SrcRegMap[%100] = $r8, the second copy doesn't
+    // destroy the content of $r8, and should not impact SrcRegMap.
+    Register Dst = MI->getOperand(0).getReg();
+    if (!Dst || Dst.isVirtual())
+      return;
+
+    Register Src = MI->getOperand(1).getReg();
+    if (regsAreCompatible(Dst, getMappedReg(Src, SrcRegMap), TRI))
+      return;
+  }
+
+  for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask()) {
+      removeMapRegEntry(MO, SrcRegMap, TRI);
+      continue;
+    }
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    Register Reg = MO.getReg();
+    if (!Reg || Reg.isVirtual())
+      continue;
+    removeMapRegEntry(MO, SrcRegMap, TRI);
+  }
+}
+
 // Returns true if Reg is equal or aliased to at least one register in Set.
 static bool regOverlapsSet(const SmallVectorImpl<Register> &Set, Register Reg,
                            const TargetRegisterInfo *TRI) {
@@ -656,9 +741,7 @@
       VirtRegPairs.push_back(NewReg);
       break;
     }
-    bool isNew = SrcRegMap.insert(std::make_pair(NewReg, Reg)).second;
-    if (!isNew)
-      assert(SrcRegMap[NewReg] == Reg && "Can't map to two src registers!");
+    SrcRegMap[NewReg] = Reg;
     VirtRegPairs.push_back(NewReg);
     Reg = NewReg;
   }
@@ -1460,9 +1543,6 @@
     // by SubRegB is compatible with RegA with no subregister. So regardless of
     // whether the dest oper writes a subreg, the source oper should not.
     MO.setSubReg(0);
-
-    // Propagate SrcRegMap.
-    SrcRegMap[RegA] = RegB;
   }
 
   if (AllUsesCopied) {
@@ -1493,6 +1573,9 @@
         LV->addVirtualRegisterKilled(RegB, *PrevMI);
     }
 
+    if (RemovedKillFlag && ReplacedAllUntiedUses)
+      SrcRegMap[LastCopiedReg] = RegB;
+
     // Update LiveIntervals.
     if (LIS) {
       LiveInterval &LI = LIS->getInterval(RegB);
@@ -1579,6 +1662,7 @@
       // First scan through all the tied register uses in this instruction
      // and record a list of pairs of tied operands for each register.
       if (!collectTiedOperands(&*mi, TiedOperands)) {
+        removeClobberedSrcRegMap(&*mi);
         mi = nmi;
         continue;
       }
@@ -1603,6 +1687,7 @@
           // The tied operands have been eliminated or shifted further down
           // the block to ease elimination. Continue processing with 'nmi'.
           TiedOperands.clear();
+          removeClobberedSrcRegMap(&*mi);
           mi = nmi;
           continue;
         }
@@ -1632,6 +1717,7 @@
       // Clear TiedOperands here instead of at the top of the loop
      // since most instructions do not have tied operands.
       TiedOperands.clear();
+      removeClobberedSrcRegMap(&*mi);
       mi = nmi;
     }
   }
Index: llvm/test/CodeGen/ARM/ssat.ll
===================================================================
--- llvm/test/CodeGen/ARM/ssat.ll
+++ llvm/test/CodeGen/ARM/ssat.ll
@@ -394,14 +394,14 @@
 define i32 @no_sat_incorrect_constant(i32 %x) #0 {
 ; V4T-LABEL: no_sat_incorrect_constant:
 ; V4T:       @ %bb.0: @ %entry
-; V4T-NEXT:    mov r2, #1065353216
+; V4T-NEXT:    mov r1, #1065353216
 ; V4T-NEXT:    cmn r0, #8388608
-; V4T-NEXT:    orr r2, r2, #-1073741824
-; V4T-NEXT:    mov r1, r0
-; V4T-NEXT:    orrlt r1, r2, #1
-; V4T-NEXT:    ldr r2, .LCPI11_0
+; V4T-NEXT:    orr r1, r1, #-1073741824
+; V4T-NEXT:    mov r2, r0
+; V4T-NEXT:    orrlt r2, r1, #1
+; V4T-NEXT:    ldr r1, .LCPI11_0
 ; V4T-NEXT:    cmp r0, #8388608
-; V4T-NEXT:    movge r1, r2
+; V4T-NEXT:    movlt r1, r2
 ; V4T-NEXT:    mov r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
Index: llvm/test/CodeGen/ARM/usat.ll
===================================================================
--- llvm/test/CodeGen/ARM/usat.ll
+++ llvm/test/CodeGen/ARM/usat.ll
@@ -566,12 +566,12 @@
 define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 {
 ; V4T-LABEL: no_unsigned_sat_incorrect_compare:
 ; V4T:       @ %bb.0: @ %entry
-; V4T-NEXT:    ldr r2, .LCPI14_0
 ; V4T-NEXT:    cmp r1, #0
-; V4T-NEXT:    mov r1, r0
-; V4T-NEXT:    movmi r1, #0
+; V4T-NEXT:    mov r2, r0
+; V4T-NEXT:    movmi r2, #0
+; V4T-NEXT:    ldr r1, .LCPI14_0
 ; V4T-NEXT:    cmp r0, #8388608
-; V4T-NEXT:    movge r1, r2
+; V4T-NEXT:    movlt r1, r2
 ; V4T-NEXT:    mov r0, r1
 ; V4T-NEXT:    bx lr
 ; V4T-NEXT:    .p2align 2
@@ -581,12 +581,12 @@
 ;
 ; V6-LABEL: no_unsigned_sat_incorrect_compare:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    ldr r2, .LCPI14_0
 ; V6-NEXT:    cmp r1, #0
-; V6-NEXT:    mov r1, r0
-; V6-NEXT:    movmi r1, #0
+; V6-NEXT:    mov r2, r0
+; V6-NEXT:    movmi r2, #0
+; V6-NEXT:    ldr r1, .LCPI14_0
 ; V6-NEXT:    cmp r0, #8388608
-; V6-NEXT:    movge r1, r2
+; V6-NEXT:    movlt r1, r2
 ; V6-NEXT:    mov r0, r1
 ; V6-NEXT:    bx lr
 ; V6-NEXT:    .p2align 2
Index: llvm/test/CodeGen/AVR/ctpop.ll
=================================================================== --- llvm/test/CodeGen/AVR/ctpop.ll +++ llvm/test/CodeGen/AVR/ctpop.ll @@ -21,7 +21,6 @@ ; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]] ; CHECK: mov {{.*}}[[SCRATCH]], {{.*}}[[RESULT]] ; CHECK: swap {{.*}}[[SCRATCH]] -; CHECK: add {{.*}}[[SCRATCH]], {{.*}}[[RESULT]] -; CHECK: andi {{.*}}[[SCRATCH]], 15 -; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]] +; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]] +; CHECK: andi {{.*}}[[RESULT]], 15 ; CHECK: ret Index: llvm/test/CodeGen/AVR/hardware-mul.ll =================================================================== --- llvm/test/CodeGen/AVR/hardware-mul.ll +++ llvm/test/CodeGen/AVR/hardware-mul.ll @@ -14,16 +14,15 @@ define i16 @mult16(i16 %a, i16 %b) { ; CHECK-LABEL: mult16: ; CHECK: muls r22, r25 -; CHECK: mov r18, r0 +; CHECK: mov r20, r0 ; CHECK: mul r22, r24 -; CHECK: mov r19, r0 -; CHECK: mov r20, r1 +; CHECK: mov r21, r0 +; CHECK: mov r18, r1 ; CHECK: clr r1 -; CHECK: add r20, r18 +; CHECK: add r18, r20 ; CHECK: muls r23, r24 ; CHECK: clr r1 -; CHECK: mov r22, r0 -; CHECK: add r22, r20 +; CHECK: add r18, r0 ; :TODO: finish after reworking shift instructions %mul = mul nsw i16 %b, %a ret i16 %mul Index: llvm/test/CodeGen/SystemZ/int-cmp-57.ll =================================================================== --- llvm/test/CodeGen/SystemZ/int-cmp-57.ll +++ llvm/test/CodeGen/SystemZ/int-cmp-57.ll @@ -82,7 +82,7 @@ ; CHECK-LABEL: fun6: ; CHECK: afi ; CHECK-NEXT: chi -; CHECK-NEXT: locrlh +; CHECK-NEXT: locre bb: %tmp = add i32 %arg, -2147483648 %tmp1 = icmp eq i32 %tmp, 0 @@ -94,7 +94,7 @@ ; CHECK-LABEL: fun7: ; CHECK: afi ; CHECK-NEXT: chi -; CHECK-NEXT: locrle +; CHECK-NEXT: locrh bb: %tmp = add i32 %arg, -2147483648 %tmp1 = icmp sgt i32 %tmp, 0 Index: llvm/test/CodeGen/Thumb/pr35836_2.ll =================================================================== --- llvm/test/CodeGen/Thumb/pr35836_2.ll +++ llvm/test/CodeGen/Thumb/pr35836_2.ll @@ -36,20 +36,20 @@ %mul = mul i128 %add18, %add ret i128 %mul } -; CHECK: adds r5, r1, r7 +; CHECK: adds r5, r1, r6 ; CHECK: mov r5, r4 -; CHECK: adcs r5, r6 +; CHECK: adcs r5, r7 ; CHECK: ldr r5, [sp, #12] @ 4-byte Reload ; CHECK: adcs r2, r5 ; CHECK: ldr r5, [sp, #16] @ 4-byte Reload ; CHECK: adcs r3, r5 -; CHECK: adds r5, r1, r7 -; CHECK: adcs r4, r6 +; CHECK: adds r6, r1, r6 +; CHECK: adcs r4, r7 ; CHECK: ldr r1, [r0, #20] ; CHECK: str r1, [sp, #16] @ 4-byte Spill -; CHECK: ldr r6, [r0, #28] +; CHECK: ldr r5, [r0, #28] ; CHECK: ldr r1, [r0, #16] ; CHECK: ldr r7, [r0, #24] ; CHECK: adcs r7, r1 ; CHECK: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK: adcs r6, r0 +; CHECK: adcs r5, r0 Index: llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll =================================================================== --- llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll +++ llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll @@ -28,8 +28,9 @@ ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; CHECK-NEXT: movl %ebp, %esi ; CHECK-NEXT: calll callee@PLT -; CHECK-NEXT: leal (%eax,%ebx), %esi -; CHECK-NEXT: addl %ebp, %esi +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: addl %ebp, %ebx +; CHECK-NEXT: movl %ebx, %esi ; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: retl %b1 = call x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) Index: llvm/test/CodeGen/X86/abs.ll =================================================================== --- llvm/test/CodeGen/X86/abs.ll +++ llvm/test/CodeGen/X86/abs.ll @@ -132,14 +132,13 @@ 
 define i128 @test_i128(i128 %a) nounwind {
 ; X64-LABEL: test_i128:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rdx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    xorq %rcx, %rdx
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    addq %rdx, %rax
+; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    xorq %rdx, %rax
+; X64-NEXT:    xorq %rsi, %rdx
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_i128:
Index: llvm/test/CodeGen/X86/add-cmov.ll
===================================================================
--- llvm/test/CodeGen/X86/add-cmov.ll
+++ llvm/test/CodeGen/X86/add-cmov.ll
@@ -136,9 +136,9 @@
 ; CHECK-LABEL: select_max32_2_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    leaq 2(%rdi), %rax
-; CHECK-NEXT:    addq $2147483647, %rdi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    leaq 2147483647(%rdi), %rcx
 ; CHECK-NEXT:    cmpq $41, %rsi
-; CHECK-NEXT:    cmovneq %rdi, %rax
+; CHECK-NEXT:    cmovneq %rcx, %rax
 ; CHECK-NEXT:    retq
   %b = icmp ne i64 %x, 41
   %s = select i1 %b, i64 2147483647, i64 2
@@ -209,10 +209,10 @@
 ; CHECK-LABEL: select_20_43_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 43(%rdi), %eax
-; CHECK-NEXT:    addl $20, %edi
+; CHECK-NEXT:    leal 43(%rdi), %ecx
+; CHECK-NEXT:    leal 20(%rdi), %eax
 ; CHECK-NEXT:    cmpq $42, %rsi
-; CHECK-NEXT:    cmovgel %edi, %eax
+; CHECK-NEXT:    cmovll %ecx, %eax
 ; CHECK-NEXT:    retq
   %b = icmp sgt i64 %x, 41
   %s = select i1 %b, i32 20, i32 43
@@ -224,10 +224,10 @@
 ; CHECK-LABEL: select_n2_17_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 17(%rdi), %eax
-; CHECK-NEXT:    addl $65534, %edi # imm = 0xFFFE
+; CHECK-NEXT:    leal 17(%rdi), %ecx
+; CHECK-NEXT:    leal 65534(%rdi), %eax
 ; CHECK-NEXT:    testb $1, %sil
-; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %s = select i1 %b, i16 -2, i16 17
Index: llvm/test/CodeGen/X86/addsub-constant-folding.ll
===================================================================
--- llvm/test/CodeGen/X86/addsub-constant-folding.ll
+++ llvm/test/CodeGen/X86/addsub-constant-folding.ll
@@ -51,7 +51,8 @@
 ; X64-NEXT:    movl %edi, %ebx
 ; X64-NEXT:    leal 8(%rbx), %edi
 ; X64-NEXT:    callq use@PLT
-; X64-NEXT:    leal 10(%rbx), %eax
+; X64-NEXT:    addl $10, %ebx
+; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
@@ -173,7 +174,8 @@
 ; X64-NEXT:    movl %edi, %ebx
 ; X64-NEXT:    leal 8(%rbx), %edi
 ; X64-NEXT:    callq use@PLT
-; X64-NEXT:    leal 6(%rbx), %eax
+; X64-NEXT:    addl $6, %ebx
+; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
@@ -427,7 +429,8 @@
 ; X64-NEXT:    movl %edi, %ebx
 ; X64-NEXT:    leal -8(%rbx), %edi
 ; X64-NEXT:    callq use@PLT
-; X64-NEXT:    leal -6(%rbx), %eax
+; X64-NEXT:    addl $-6, %ebx
+; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
@@ -545,7 +548,8 @@
 ; X64-NEXT:    movl %edi, %ebx
 ; X64-NEXT:    leal -8(%rbx), %edi
 ; X64-NEXT:    callq use@PLT
-; X64-NEXT:    leal -10(%rbx), %eax
+; X64-NEXT:    addl $-10, %ebx
+; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
Index: llvm/test/CodeGen/X86/align-down.ll
===================================================================
--- llvm/test/CodeGen/X86/align-down.ll
+++ llvm/test/CodeGen/X86/align-down.ll
@@ -231,8 +231,8 @@
 ;
 ; NOBMI-X64-LABEL:
n8_not_lowbit_mask: ; NOBMI-X64: # %bb.0: -; NOBMI-X64-NEXT: movl %esi, %eax -; NOBMI-X64-NEXT: incl %eax +; NOBMI-X64-NEXT: # kill: def $esi killed $esi def $rsi +; NOBMI-X64-NEXT: leal 1(%rsi), %eax ; NOBMI-X64-NEXT: notl %eax ; NOBMI-X64-NEXT: andl %edi, %eax ; NOBMI-X64-NEXT: retq @@ -260,8 +260,8 @@ ; ; X64-LABEL: n9_sub_is_not_commutative: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: leal -1(%rsi), %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: subl %edi, %eax ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/arithmetic_fence2.ll =================================================================== --- llvm/test/CodeGen/X86/arithmetic_fence2.ll +++ llvm/test/CodeGen/X86/arithmetic_fence2.ll @@ -58,8 +58,7 @@ ; X64-NEXT: addsd %xmm0, %xmm0 ; X64-NEXT: movapd %xmm0, %xmm1 ; X64-NEXT: #ARITH_FENCE -; X64-NEXT: addsd %xmm0, %xmm1 -; X64-NEXT: movapd %xmm1, %xmm0 +; X64-NEXT: addsd %xmm1, %xmm0 ; X64-NEXT: retq %1 = fadd fast double %a, %a %t = call double @llvm.arithmetic.fence.f64(double %1) @@ -90,8 +89,7 @@ ; X86-NEXT: addps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, %xmm1 ; X86-NEXT: #ARITH_FENCE -; X86-NEXT: addps %xmm0, %xmm1 -; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: addps %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: f4: @@ -99,8 +97,7 @@ ; X64-NEXT: addps %xmm0, %xmm0 ; X64-NEXT: movaps %xmm0, %xmm1 ; X64-NEXT: #ARITH_FENCE -; X64-NEXT: addps %xmm0, %xmm1 -; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: retq %1 = fadd fast <2 x float> %a, %a %t = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> %1) @@ -138,10 +135,8 @@ ; X86-NEXT: #ARITH_FENCE ; X86-NEXT: movaps %xmm0, %xmm3 ; X86-NEXT: #ARITH_FENCE -; X86-NEXT: addps %xmm0, %xmm3 -; X86-NEXT: addps %xmm1, %xmm2 -; X86-NEXT: movaps %xmm3, %xmm0 -; X86-NEXT: movaps %xmm2, %xmm1 +; X86-NEXT: addps %xmm3, %xmm0 +; X86-NEXT: addps %xmm2, %xmm1 ; X86-NEXT: retl ; ; X64-LABEL: f6: @@ -152,10 +147,8 @@ ; X64-NEXT: #ARITH_FENCE ; X64-NEXT: movaps %xmm0, %xmm3 ; X64-NEXT: #ARITH_FENCE -; X64-NEXT: addps %xmm0, %xmm3 -; X64-NEXT: addps %xmm1, %xmm2 -; X64-NEXT: movaps %xmm3, %xmm0 -; X64-NEXT: movaps %xmm2, %xmm1 +; X64-NEXT: addps %xmm3, %xmm0 +; X64-NEXT: addps %xmm2, %xmm1 ; X64-NEXT: retq %1 = fadd fast <8 x float> %a, %a %t = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> %1) Index: llvm/test/CodeGen/X86/avx512-inc-dec.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-inc-dec.ll +++ llvm/test/CodeGen/X86/avx512-inc-dec.ll @@ -4,10 +4,11 @@ define i32 @test(i32 %a, i32 %b) { ; CHECK-LABEL: test: ; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $esi killed $esi def $rsi ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal -1(%rdi), %eax -; CHECK-NEXT: addl $1, %esi -; CHECK-NEXT: imull %esi, %eax +; CHECK-NEXT: leal -1(%rdi), %ecx +; CHECK-NEXT: leal 1(%rsi), %eax +; CHECK-NEXT: imull %ecx, %eax ; CHECK-NEXT: retq %a1 = add i32 %a, -1 %b1 = add i32 %b, 1 Index: llvm/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -151,10 +151,9 @@ ; CHECK-LABEL: mand16: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: andl %esi, %ecx -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: 
orl %edi, %eax ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq ; Index: llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -939,49 +939,47 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: pushl %ebx -; X32-NEXT: subl $20, %esp -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: subl $12, %esp +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: leal (%edx,%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: subl %ecx, %ebx +; X32-NEXT: subl %esi, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: subl %ecx, %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %eax, %ebx -; X32-NEXT: movl %edx, %eax -; X32-NEXT: subl %edi, %eax +; X32-NEXT: imull %ebx, %eax +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: subl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %esi, %eax -; X32-NEXT: addl %eax, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl {{[0-9]+}}(%esp), %edi +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: imull %eax, %ebp +; X32-NEXT: imull %ebp, %edi +; X32-NEXT: addl {{[0-9]+}}(%esp), %esi +; X32-NEXT: imull {{[0-9]+}}(%esp), %esi +; X32-NEXT: addl %esi, %edi ; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %esi, %edx -; X32-NEXT: addl %ebp, %edx -; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: addl $20, %esp +; X32-NEXT: imull %eax, %edx +; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %ebx ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -991,35 +989,47 @@ ; WIN64-NEXT: pushq %r13 ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx -; WIN64-NEXT: movl %eax, %r13d -; WIN64-NEXT: subl %ecx, %eax -; WIN64-NEXT: movl %edx, %ebp -; WIN64-NEXT: subl %edi, %ebp -; WIN64-NEXT: movl %r9d, %ebx -; WIN64-NEXT: subl 
%r10d, %ebx -; WIN64-NEXT: imull %ebx, %eax +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx +; WIN64-NEXT: movl %ecx, %ebx +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi +; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 +; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 +; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 +; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11 +; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10 +; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 +; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 +; WIN64-NEXT: # kill: def $edi killed $edi def $rdi +; WIN64-NEXT: leal (%rdx,%rdi), %r13d +; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx +; WIN64-NEXT: subl %edi, %edx +; WIN64-NEXT: leal (%rsi,%r8), %ecx +; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi +; WIN64-NEXT: subl %r8d, %esi +; WIN64-NEXT: leal (%r9,%r10), %r8d +; WIN64-NEXT: movl %r9d, %ebp +; WIN64-NEXT: subl %r10d, %ebp +; WIN64-NEXT: movl %eax, %edi +; WIN64-NEXT: movl %ebx, %r9d +; WIN64-NEXT: subl %ebx, %edi +; WIN64-NEXT: imull %edi, %ebp +; WIN64-NEXT: leal (%r11,%r12), %edi ; WIN64-NEXT: movl %r11d, %ebx ; WIN64-NEXT: subl %r12d, %ebx -; WIN64-NEXT: imull %ebp, %ebx -; WIN64-NEXT: movl %esi, %ebp -; WIN64-NEXT: subl %r8d, %ebp -; WIN64-NEXT: addl %ebx, %eax -; WIN64-NEXT: movl %r14d, %ebx -; WIN64-NEXT: subl %r15d, %ebx -; WIN64-NEXT: imull %ebp, %ebx -; WIN64-NEXT: addl %ebx, %eax -; WIN64-NEXT: addl %ecx, %r13d -; WIN64-NEXT: addl %edi, %edx -; WIN64-NEXT: addl %r8d, %esi -; WIN64-NEXT: addl %r10d, %r9d -; WIN64-NEXT: imull %r13d, %r9d -; WIN64-NEXT: addl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d -; WIN64-NEXT: addl %r15d, %r14d -; WIN64-NEXT: imull %esi, %r14d -; WIN64-NEXT: addl %r11d, %r14d -; WIN64-NEXT: addl %r14d, %eax +; WIN64-NEXT: imull %edx, %ebx +; WIN64-NEXT: addl %ebp, %ebx +; WIN64-NEXT: leal (%r14,%r15), %edx +; WIN64-NEXT: movl %r14d, %ebp +; WIN64-NEXT: subl %r15d, %ebp +; WIN64-NEXT: imull %esi, %ebp +; WIN64-NEXT: addl %ebx, %ebp +; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: imull %r8d, %eax +; WIN64-NEXT: imull %r13d, %edi +; WIN64-NEXT: addl %edi, %eax +; WIN64-NEXT: imull %ecx, %edx +; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: addl %ebp, %eax ; WIN64-NEXT: popq %rbx ; WIN64-NEXT: popq %rbp ; WIN64-NEXT: popq %r13 @@ -1029,36 +1039,44 @@ ; LINUXOSX64: # %bb.0: ; LINUXOSX64-NEXT: pushq %rbp ; LINUXOSX64-NEXT: pushq %rbx -; LINUXOSX64-NEXT: movl %eax, %r10d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; LINUXOSX64-NEXT: subl %ecx, %eax -; LINUXOSX64-NEXT: movl %edx, %ebx -; LINUXOSX64-NEXT: subl %edi, %ebx -; LINUXOSX64-NEXT: movl %r9d, %ebp -; LINUXOSX64-NEXT: subl %r12d, %ebp -; LINUXOSX64-NEXT: imull %ebp, %eax -; LINUXOSX64-NEXT: movl %r13d, %ebp -; LINUXOSX64-NEXT: subl %r14d, %ebp -; LINUXOSX64-NEXT: imull %ebx, %ebp -; LINUXOSX64-NEXT: movl %esi, %ebx -; LINUXOSX64-NEXT: subl %r8d, %ebx -; LINUXOSX64-NEXT: addl %ebp, %eax -; LINUXOSX64-NEXT: movl %r15d, %ebp -; LINUXOSX64-NEXT: subl %r11d, %ebp -; LINUXOSX64-NEXT: imull %ebx, %ebp +; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx +; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi +; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14 +; LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13 +; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12 +; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9 +; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 +; LINUXOSX64-NEXT: # kill: def $edi killed $edi def 
$rdi +; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d +; LINUXOSX64-NEXT: movl %edx, %ebp +; LINUXOSX64-NEXT: subl %edi, %ebp +; LINUXOSX64-NEXT: leal (%rsi,%r8), %r11d +; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi +; LINUXOSX64-NEXT: subl %r8d, %esi +; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d +; LINUXOSX64-NEXT: movl %r9d, %edi +; LINUXOSX64-NEXT: subl %r12d, %edi +; LINUXOSX64-NEXT: movl %eax, %edx +; LINUXOSX64-NEXT: subl %ecx, %edx +; LINUXOSX64-NEXT: imull %edx, %edi +; LINUXOSX64-NEXT: leal (%r13,%r14), %edx +; LINUXOSX64-NEXT: movl %r13d, %ebx +; LINUXOSX64-NEXT: subl %r14d, %ebx +; LINUXOSX64-NEXT: imull %ebp, %ebx +; LINUXOSX64-NEXT: movl 24(%rsp), %ebp +; LINUXOSX64-NEXT: addl %edi, %ebx +; LINUXOSX64-NEXT: movl %r15d, %edi +; LINUXOSX64-NEXT: subl %ebp, %edi +; LINUXOSX64-NEXT: imull %esi, %edi +; LINUXOSX64-NEXT: addl %ebx, %edi +; LINUXOSX64-NEXT: addl %ecx, %eax +; LINUXOSX64-NEXT: imull %r8d, %eax +; LINUXOSX64-NEXT: imull %r10d, %edx +; LINUXOSX64-NEXT: addl %edx, %eax +; LINUXOSX64-NEXT: addl %r15d, %ebp +; LINUXOSX64-NEXT: imull %r11d, %ebp ; LINUXOSX64-NEXT: addl %ebp, %eax -; LINUXOSX64-NEXT: addl %ecx, %r10d -; LINUXOSX64-NEXT: addl %edi, %edx -; LINUXOSX64-NEXT: addl %r8d, %esi -; LINUXOSX64-NEXT: addl %r12d, %r9d -; LINUXOSX64-NEXT: imull %r10d, %r9d -; LINUXOSX64-NEXT: addl %r14d, %r13d -; LINUXOSX64-NEXT: imull %edx, %r13d -; LINUXOSX64-NEXT: addl %r9d, %r13d -; LINUXOSX64-NEXT: addl %r11d, %r15d -; LINUXOSX64-NEXT: imull %esi, %r15d -; LINUXOSX64-NEXT: addl %r13d, %r15d -; LINUXOSX64-NEXT: addl %r15d, %eax +; LINUXOSX64-NEXT: addl %edi, %eax ; LINUXOSX64-NEXT: popq %rbx ; LINUXOSX64-NEXT: popq %rbp ; LINUXOSX64-NEXT: retq Index: llvm/test/CodeGen/X86/avx512bw-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bw-mask-op.ll +++ llvm/test/CodeGen/X86/avx512bw-mask-op.ll @@ -79,10 +79,9 @@ ; CHECK-LABEL: mand32: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: andl %esi, %ecx -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq %ma = bitcast i32 %x to <32 x i1> %mb = bitcast i32 %y to <32 x i1> @@ -116,10 +115,9 @@ ; CHECK-LABEL: mand64: ; CHECK: ## %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: xorq %rsi, %rax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: xorq %rsi, %rdi +; CHECK-NEXT: orq %rdi, %rax ; CHECK-NEXT: retq %ma = bitcast i64 %x to <64 x i1> %mb = bitcast i64 %y to <64 x i1> Index: llvm/test/CodeGen/X86/avx512dq-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -33,11 +33,9 @@ ; CHECK-LABEL: mand8: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andb %sil, %cl -; CHECK-NEXT: xorb %sil, %al -; CHECK-NEXT: orb %cl, %al -; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: xorb %sil, %dil +; CHECK-NEXT: orb %dil, %al ; CHECK-NEXT: retq %ma = bitcast i8 %x to <8 x i1> %mb = bitcast i8 %y to <8 x i1> Index: llvm/test/CodeGen/X86/bitreverse.ll =================================================================== --- llvm/test/CodeGen/X86/bitreverse.ll +++ llvm/test/CodeGen/X86/bitreverse.ll @@ -365,7 +365,6 @@ ; ; X64-LABEL: 
test_bitreverse_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $51, %al @@ -378,8 +377,7 @@ ; X64-NEXT: addb %al, %al ; X64-NEXT: shrb %dil ; X64-NEXT: andb $85, %dil -; X64-NEXT: addl %edi, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: retq ; ; X86XOP-LABEL: test_bitreverse_i8: @@ -417,7 +415,6 @@ ; ; X64-LABEL: test_bitreverse_i4: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: rolb $4, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $51, %al @@ -430,9 +427,8 @@ ; X64-NEXT: addb %al, %al ; X64-NEXT: shrb %dil ; X64-NEXT: andb $80, %dil -; X64-NEXT: addl %edi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: shrb $4, %al -; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86XOP-LABEL: test_bitreverse_i4: Index: llvm/test/CodeGen/X86/bswap_tree2.ll =================================================================== --- llvm/test/CodeGen/X86/bswap_tree2.ll +++ llvm/test/CodeGen/X86/bswap_tree2.ll @@ -24,17 +24,16 @@ ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: movl %edi, %ecx ; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 -; CHECK64-NEXT: movl %edi, %edx -; CHECK64-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK64-NEXT: movl %edi, %eax +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 ; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %edx -; CHECK64-NEXT: orl %ecx, %edx -; CHECK64-NEXT: bswapl %eax -; CHECK64-NEXT: shrl $16, %eax -; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 @@ -81,7 +80,7 @@ ; CHECK64-NEXT: andl $-16777216, %edi # imm = 0xFF000000 ; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; CHECK64-NEXT: orl %edi, %eax -; CHECK64-NEXT: addl %ecx, %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %byte1 = lshr i32 %x, 8 %byte0 = shl i32 %x, 8 Index: llvm/test/CodeGen/X86/cmp-concat.ll =================================================================== --- llvm/test/CodeGen/X86/cmp-concat.ll +++ llvm/test/CodeGen/X86/cmp-concat.ll @@ -81,8 +81,9 @@ ; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pcmpeqw %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqw %xmm2, %xmm0 -; CHECK-NEXT: packsswb %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqw %xmm0, %xmm2 +; CHECK-NEXT: packsswb %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %zx = zext <16 x i8> %x to <16 x i16> %zy = zext <16 x i8> %y to <16 x i16> Index: llvm/test/CodeGen/X86/combine-mul.ll =================================================================== --- llvm/test/CodeGen/X86/combine-mul.ll +++ llvm/test/CodeGen/X86/combine-mul.ll @@ -82,12 +82,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllq $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $4, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllq $4, %xmm2 ; SSE-NEXT: psllq $2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_pow2c: Index: llvm/test/CodeGen/X86/combine-mulo.ll =================================================================== --- llvm/test/CodeGen/X86/combine-mulo.ll +++ llvm/test/CodeGen/X86/combine-mulo.ll @@ -37,8 +37,7 @@ ; SSE-NEXT: paddd %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq Index: llvm/test/CodeGen/X86/combine-or.ll =================================================================== --- llvm/test/CodeGen/X86/combine-or.ll +++ llvm/test/CodeGen/X86/combine-or.ll @@ -240,8 +240,9 @@ ; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> Index: llvm/test/CodeGen/X86/combine-sdiv.ll =================================================================== --- llvm/test/CodeGen/X86/combine-sdiv.ll +++ llvm/test/CodeGen/X86/combine-sdiv.ll @@ -201,15 +201,15 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pos1: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pos1: @@ -246,9 +246,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $30, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_by_pow2a: @@ -489,8 +488,7 @@ ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16: @@ -611,25 +609,23 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psraw $15, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = -; SSE41-NEXT: pmulhuw %xmm4, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = +; SSE41-NEXT: pmulhuw %xmm3, %xmm2 ; 
SSE41-NEXT: paddw %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pmulhw %xmm5, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pmulhw %xmm4, %xmm5 ; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psraw $15, %xmm3 -; SSE41-NEXT: pmulhuw %xmm4, %xmm3 -; SSE41-NEXT: paddw %xmm1, %xmm3 -; SSE41-NEXT: pmulhw %xmm3, %xmm5 -; SSE41-NEXT: psraw $1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psraw $15, %xmm2 +; SSE41-NEXT: pmulhuw %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: pmulhw %xmm2, %xmm4 +; SSE41-NEXT: psraw $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16: @@ -825,46 +821,43 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = -; SSE41-NEXT: pmulhuw %xmm7, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 -; SSE41-NEXT: psraw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psraw $15, %xmm1 -; SSE41-NEXT: pmulhuw %xmm7, %xmm1 -; SSE41-NEXT: paddw %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psraw $15, %xmm4 -; SSE41-NEXT: pmulhuw %xmm7, %xmm4 -; SSE41-NEXT: paddw %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pmulhw %xmm6, %xmm5 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: psraw $15, %xmm5 -; SSE41-NEXT: pmulhuw %xmm7, %xmm5 -; SSE41-NEXT: paddw %xmm3, %xmm5 -; SSE41-NEXT: pmulhw %xmm5, %xmm6 -; SSE41-NEXT: psraw $1, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm0, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = 
xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pmulhw %xmm4, %xmm7 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psraw $15, %xmm6 +; SSE41-NEXT: pmulhuw %xmm5, %xmm6 +; SSE41-NEXT: paddw %xmm3, %xmm6 +; SSE41-NEXT: pmulhw %xmm6, %xmm4 +; SSE41-NEXT: psraw $1, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16: @@ -1044,8 +1037,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: @@ -1131,23 +1123,23 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrld $28, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrld $30, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld $29, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $4, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -1165,8 +1157,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: @@ -1311,43 +1302,42 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrld $28, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrad $4, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1365,27 +1355,25 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: psrad $3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm5 -; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; 
SSE41-NEXT: psrld $28, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrld $30, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: paddd %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrad $4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: paddd %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrad $4, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrad $3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: @@ -1545,8 +1533,7 @@ ; SSE41-NEXT: psrad $2, %xmm2 ; SSE41-NEXT: psrlq $2, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: @@ -1640,14 +1627,14 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrlq $62, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: psrlq $62, %xmm2 +; SSE41-NEXT: paddq %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $2, %xmm3 -; SSE41-NEXT: psrlq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: psrlq $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -1803,26 +1790,25 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: psrlq $62, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $2, %xmm2 -; SSE41-NEXT: psrlq $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrlq $62, %xmm2 -; SSE41-NEXT: paddq %xmm5, %xmm2 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: psrlq $62, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $2, %xmm5 +; SSE41-NEXT: psrlq $2, %xmm1 +; SSE41-NEXT: pblendw 
{{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrad $2, %xmm1 -; SSE41-NEXT: psrlq $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: psrlq $62, %xmm1 +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $2, %xmm5 +; SSE41-NEXT: psrlq $2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -2027,9 +2013,8 @@ ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: psubd %xmm3, %xmm2 ; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -2358,8 +2343,7 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: non_splat_minus_one_divisor_2: @@ -2404,8 +2388,7 @@ ; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $15, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sdiv_nonuniform: @@ -2817,14 +2800,13 @@ ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8> -; SSE41-NEXT: pmulhw %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4,256,256,u,u,512,256,8> +; SSE41-NEXT: pmulhw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] +; SSE41-NEXT: paddw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_nonuniform6: @@ -3093,7 +3075,6 @@ define i32 @combine_sdiv_two(i32 %x) { ; CHECK-LABEL: combine_sdiv_two: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: addl %edi, %eax @@ -3106,7 +3087,6 @@ define i32 @combine_sdiv_negtwo(i32 %x) { ; CHECK-LABEL: combine_sdiv_negtwo: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: addl %edi, %eax @@ -3120,13 +3100,11 @@ define i8 @combine_i8_sdiv_pow2(i8 %x) { ; CHECK-LABEL: 
combine_i8_sdiv_pow2: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: sarb $7, %al ; CHECK-NEXT: shrb $4, %al -; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: addb %dil, %al ; CHECK-NEXT: sarb $4, %al -; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %1 = sdiv i8 %x, 16 ret i8 %1 @@ -3135,14 +3113,12 @@ define i8 @combine_i8_sdiv_negpow2(i8 %x) { ; CHECK-LABEL: combine_i8_sdiv_negpow2: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: sarb $7, %al ; CHECK-NEXT: shrb $2, %al -; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: addb %dil, %al ; CHECK-NEXT: sarb $6, %al ; CHECK-NEXT: negb %al -; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %1 = sdiv i8 %x, -64 ret i8 %1 Index: llvm/test/CodeGen/X86/combine-smax.ll =================================================================== --- llvm/test/CodeGen/X86/combine-smax.ll +++ llvm/test/CodeGen/X86/combine-smax.ll @@ -12,24 +12,24 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pmaxub %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i8_nosignbit: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pmaxsb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; SSE42-LABEL: test_v16i8_nosignbit: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE42-NEXT: pand %xmm2, %xmm0 -; SSE42-NEXT: pand %xmm2, %xmm1 -; SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; SSE42-NEXT: pand %xmm1, %xmm2 +; SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX-LABEL: test_v16i8_nosignbit: Index: llvm/test/CodeGen/X86/combine-smin.ll =================================================================== --- llvm/test/CodeGen/X86/combine-smin.ll +++ llvm/test/CodeGen/X86/combine-smin.ll @@ -12,24 +12,24 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pminub %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i8_nosignbit: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pminsb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; SSE42-LABEL: test_v16i8_nosignbit: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE42-NEXT: pand %xmm2, %xmm0 -; SSE42-NEXT: pand %xmm2, %xmm1 -; SSE42-NEXT: pminsb %xmm1, %xmm0 +; SSE42-NEXT: pand %xmm1, %xmm2 +; SSE42-NEXT: pminsb %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX-LABEL: test_v16i8_nosignbit: Index: llvm/test/CodeGen/X86/combine-sra.ll =================================================================== --- llvm/test/CodeGen/X86/combine-sra.ll +++ llvm/test/CodeGen/X86/combine-sra.ll @@ -210,9 +210,8 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: psrad $1, %xmm0 ; 
SSE-NEXT: psrad $3, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr: @@ -283,9 +282,8 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: psrad $1, %xmm0 ; SSE-NEXT: psrad $3, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; SSE-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: Index: llvm/test/CodeGen/X86/combine-srem.ll =================================================================== --- llvm/test/CodeGen/X86/combine-srem.ll +++ llvm/test/CodeGen/X86/combine-srem.ll @@ -333,8 +333,7 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_srem_by_pow2b_neg: Index: llvm/test/CodeGen/X86/combine-srl.ll =================================================================== --- llvm/test/CodeGen/X86/combine-srl.ll +++ llvm/test/CodeGen/X86/combine-srl.ll @@ -346,29 +346,28 @@ ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufb %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlw $4, %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pshufb %xmm1, %xmm2 +; SSE-NEXT: pshufb %xmm3, %xmm1 +; SSE-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: paddb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 ; SSE-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrld $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: psrld $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: Index: llvm/test/CodeGen/X86/combine-udiv.ll =================================================================== --- llvm/test/CodeGen/X86/combine-udiv.ll +++ llvm/test/CodeGen/X86/combine-udiv.ll @@ -189,15 +189,15 @@ ; ; SSE41-LABEL: combine_vec_udiv_by_pow2b: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; 
SSE41-NEXT: psrld $4, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrld $2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrld $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: psrld $2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_udiv_by_pow2b: @@ -559,11 +559,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_udiv_nonuniform2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -574,10 +574,9 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_vec_udiv_nonuniform2: @@ -633,9 +632,9 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-LABEL: combine_vec_udiv_nonuniform4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -643,7 +642,7 @@ ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: psrlw $7, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -693,35 +692,35 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: pmulhuw %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psubw %xmm2, %xmm0 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38477: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE41-NEXT: pmulhuw %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psubw %xmm2, %xmm1 -; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubw %xmm1, %xmm2 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: paddw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm2, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: pr38477: Index: llvm/test/CodeGen/X86/ctpop-combine.ll =================================================================== --- llvm/test/CodeGen/X86/ctpop-combine.ll +++ llvm/test/CodeGen/X86/ctpop-combine.ll @@ -88,7 +88,6 @@ ; ; NO-POPCOUNT-LABEL: test4: ; NO-POPCOUNT: # %bb.0: -; NO-POPCOUNT-NEXT: # kill: def $edi killed $edi def $rdi ; NO-POPCOUNT-NEXT: andb $127, %dil ; NO-POPCOUNT-NEXT: movl %edi, %eax ; NO-POPCOUNT-NEXT: shrb %al @@ -101,9 +100,8 @@ ; NO-POPCOUNT-NEXT: addb %al, %dil ; NO-POPCOUNT-NEXT: movl %edi, %eax ; NO-POPCOUNT-NEXT: shrb $4, %al -; NO-POPCOUNT-NEXT: addl %edi, %eax +; NO-POPCOUNT-NEXT: addb %dil, %al ; NO-POPCOUNT-NEXT: andb $15, %al -; NO-POPCOUNT-NEXT: # kill: def $al killed $al killed $eax ; NO-POPCOUNT-NEXT: retq %x2 = and i8 %x, 127 %count = tail call i8 @llvm.ctpop.i8(i8 %x2) Index: llvm/test/CodeGen/X86/fshl.ll =================================================================== --- llvm/test/CodeGen/X86/fshl.ll +++ llvm/test/CodeGen/X86/fshl.ll @@ -831,27 +831,26 @@ ; X64-FAST-LABEL: var_shift_i128: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: movq %r8, %r9 -; X64-FAST-NEXT: movq %rcx, %r10 -; X64-FAST-NEXT: movq %rdx, %r8 -; X64-FAST-NEXT: movq %rsi, %rdx +; X64-FAST-NEXT: movq %rcx, %r8 ; X64-FAST-NEXT: movl %r9d, %ecx -; X64-FAST-NEXT: shldq %cl, %rdi, %rdx -; X64-FAST-NEXT: shrdq $1, %r10, %r8 -; X64-FAST-NEXT: shrq %r10 +; X64-FAST-NEXT: shldq %cl, %rdi, %rsi +; X64-FAST-NEXT: shrdq $1, %r8, %rdx +; X64-FAST-NEXT: shrq %r8 ; X64-FAST-NEXT: notb %cl -; X64-FAST-NEXT: shrdq %cl, %r10, %r8 -; X64-FAST-NEXT: shrq %cl, %r10 +; X64-FAST-NEXT: shrdq %cl, %r8, %rdx +; X64-FAST-NEXT: shrq %cl, %r8 
; X64-FAST-NEXT: xorl %eax, %eax ; X64-FAST-NEXT: testb $64, %cl -; X64-FAST-NEXT: cmovneq %r10, %r8 -; X64-FAST-NEXT: cmovneq %rax, %r10 +; X64-FAST-NEXT: cmovneq %r8, %rdx +; X64-FAST-NEXT: cmovneq %rax, %r8 ; X64-FAST-NEXT: movl %r9d, %ecx ; X64-FAST-NEXT: shlq %cl, %rdi ; X64-FAST-NEXT: testb $64, %r9b -; X64-FAST-NEXT: cmovneq %rdi, %rdx +; X64-FAST-NEXT: cmovneq %rdi, %rsi ; X64-FAST-NEXT: cmoveq %rdi, %rax -; X64-FAST-NEXT: orq %r8, %rax -; X64-FAST-NEXT: orq %r10, %rdx +; X64-FAST-NEXT: orq %rdx, %rax +; X64-FAST-NEXT: orq %rsi, %r8 +; X64-FAST-NEXT: movq %r8, %rdx ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i128: Index: llvm/test/CodeGen/X86/fshr.ll =================================================================== --- llvm/test/CodeGen/X86/fshr.ll +++ llvm/test/CodeGen/X86/fshr.ll @@ -839,28 +839,27 @@ ; ; X64-FAST-LABEL: var_shift_i128: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq %r8, %r10 -; X64-FAST-NEXT: movq %rcx, %r9 -; X64-FAST-NEXT: movq %rdx, %r8 -; X64-FAST-NEXT: movq %rsi, %rdx -; X64-FAST-NEXT: movl %r10d, %ecx -; X64-FAST-NEXT: shrdq %cl, %r9, %r8 -; X64-FAST-NEXT: shrq %cl, %r9 +; X64-FAST-NEXT: movq %r8, %r9 +; X64-FAST-NEXT: movq %rcx, %r8 +; X64-FAST-NEXT: movl %r9d, %ecx +; X64-FAST-NEXT: shrdq %cl, %r8, %rdx +; X64-FAST-NEXT: shrq %cl, %r8 ; X64-FAST-NEXT: xorl %eax, %eax -; X64-FAST-NEXT: testb $64, %r10b -; X64-FAST-NEXT: cmovneq %r9, %r8 -; X64-FAST-NEXT: cmovneq %rax, %r9 -; X64-FAST-NEXT: shldq $1, %rdi, %rdx +; X64-FAST-NEXT: testb $64, %r9b +; X64-FAST-NEXT: cmovneq %r8, %rdx +; X64-FAST-NEXT: cmovneq %rax, %r8 +; X64-FAST-NEXT: shldq $1, %rdi, %rsi ; X64-FAST-NEXT: addq %rdi, %rdi -; X64-FAST-NEXT: notb %r10b -; X64-FAST-NEXT: movl %r10d, %ecx -; X64-FAST-NEXT: shldq %cl, %rdi, %rdx +; X64-FAST-NEXT: notb %r9b +; X64-FAST-NEXT: movl %r9d, %ecx +; X64-FAST-NEXT: shldq %cl, %rdi, %rsi ; X64-FAST-NEXT: shlq %cl, %rdi -; X64-FAST-NEXT: testb $64, %r10b -; X64-FAST-NEXT: cmovneq %rdi, %rdx +; X64-FAST-NEXT: testb $64, %r9b +; X64-FAST-NEXT: cmovneq %rdi, %rsi ; X64-FAST-NEXT: cmoveq %rdi, %rax -; X64-FAST-NEXT: orq %r8, %rax -; X64-FAST-NEXT: orq %r9, %rdx +; X64-FAST-NEXT: orq %rdx, %rax +; X64-FAST-NEXT: orq %rsi, %r8 +; X64-FAST-NEXT: movq %r8, %rdx ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: var_shift_i128: Index: llvm/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/test/CodeGen/X86/funnel-shift.ll +++ llvm/test/CodeGen/X86/funnel-shift.ll @@ -264,27 +264,26 @@ ; X64-AVX2-LABEL: fshl_i128: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movq %r8, %r9 -; X64-AVX2-NEXT: movq %rcx, %r10 -; X64-AVX2-NEXT: movq %rdx, %r8 -; X64-AVX2-NEXT: movq %rsi, %rdx +; X64-AVX2-NEXT: movq %rcx, %r8 ; X64-AVX2-NEXT: movl %r9d, %ecx -; X64-AVX2-NEXT: shldq %cl, %rdi, %rdx -; X64-AVX2-NEXT: shrdq $1, %r10, %r8 -; X64-AVX2-NEXT: shrq %r10 +; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi +; X64-AVX2-NEXT: shrdq $1, %r8, %rdx +; X64-AVX2-NEXT: shrq %r8 ; X64-AVX2-NEXT: notb %cl -; X64-AVX2-NEXT: shrdq %cl, %r10, %r8 -; X64-AVX2-NEXT: shrq %cl, %r10 +; X64-AVX2-NEXT: shrdq %cl, %r8, %rdx +; X64-AVX2-NEXT: shrq %cl, %r8 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: testb $64, %cl -; X64-AVX2-NEXT: cmovneq %r10, %r8 -; X64-AVX2-NEXT: cmovneq %rax, %r10 +; X64-AVX2-NEXT: cmovneq %r8, %rdx +; X64-AVX2-NEXT: cmovneq %rax, %r8 ; X64-AVX2-NEXT: movl %r9d, %ecx ; X64-AVX2-NEXT: shlq %cl, %rdi ; X64-AVX2-NEXT: testb $64, %r9b -; X64-AVX2-NEXT: cmovneq %rdi, %rdx +; X64-AVX2-NEXT: cmovneq %rdi, %rsi ; X64-AVX2-NEXT: cmoveq %rdi, %rax 
-; X64-AVX2-NEXT: orq %r8, %rax -; X64-AVX2-NEXT: orq %r10, %rdx +; X64-AVX2-NEXT: orq %rdx, %rax +; X64-AVX2-NEXT: orq %rsi, %r8 +; X64-AVX2-NEXT: movq %r8, %rdx ; X64-AVX2-NEXT: retq %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f Index: llvm/test/CodeGen/X86/haddsub-shuf.ll =================================================================== --- llvm/test/CodeGen/X86/haddsub-shuf.ll +++ llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -1296,10 +1296,10 @@ ; SSE3-NEXT: movaps %xmm0, %xmm5 ; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE3-NEXT: paddd %xmm4, %xmm2 +; SSE3-NEXT: paddd %xmm2, %xmm4 ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE3-NEXT: paddd %xmm5, %xmm0 -; SSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm4, %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: hadd_8i32_v8i32_shuffle: Index: llvm/test/CodeGen/X86/haddsub-undef.ll =================================================================== --- llvm/test/CodeGen/X86/haddsub-undef.ll +++ llvm/test/CodeGen/X86/haddsub-undef.ll @@ -117,8 +117,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: test5_undef: @@ -442,8 +441,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_pd_003_2: @@ -676,9 +674,8 @@ ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-SLOW-NEXT: movapd %xmm0, %xmm3 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-SLOW-NEXT: addpd %xmm0, %xmm3 +; SSE-SLOW-NEXT: addpd %xmm3, %xmm0 ; SSE-SLOW-NEXT: addpd %xmm2, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm3, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_pd_011: @@ -965,8 +962,8 @@ ; SSE-SLOW-LABEL: PR45747_2: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; Index: llvm/test/CodeGen/X86/haddsub.ll =================================================================== --- llvm/test/CodeGen/X86/haddsub.ll +++ llvm/test/CodeGen/X86/haddsub.ll @@ -45,8 +45,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: haddpd3: @@ -584,8 +583,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64: @@ -614,8 +612,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; 
SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute: @@ -861,8 +858,8 @@ ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32: ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -963,8 +960,8 @@ ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute: ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1001,8 +998,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64: @@ -1067,8 +1063,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute: @@ -1371,8 +1366,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64: @@ -1403,8 +1397,7 @@ ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute: @@ -1786,8 +1779,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: hadd32_4: @@ -1828,8 +1820,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: hadd32_8: @@ -1872,8 +1863,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: hadd32_16: @@ -2064,8 +2054,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps 
%xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32: @@ -2110,8 +2099,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: @@ -2154,8 +2142,7 @@ ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32: Index: llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -25,8 +25,7 @@ ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: PR37890_v4f32: @@ -71,8 +70,7 @@ ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addsd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-SLOW-LABEL: PR37890_v4f64: @@ -80,8 +78,7 @@ ; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSSE3-SLOW-NEXT: addsd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: PR37890_v4f64: @@ -143,8 +140,7 @@ ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addss %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: PR37890_v8f32: Index: llvm/test/CodeGen/X86/horizontal-sum.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-sum.ll +++ llvm/test/CodeGen/X86/horizontal-sum.ll @@ -980,28 +980,26 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm0 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSSE3-FAST-NEXT: addps %xmm2, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm3, 
%xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] -; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm4, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSSE3-FAST-NEXT: addps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: Index: llvm/test/CodeGen/X86/i128-mul.ll =================================================================== --- llvm/test/CodeGen/X86/i128-mul.ll +++ llvm/test/CodeGen/X86/i128-mul.ll @@ -13,24 +13,23 @@ ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl %esi, %eax -; X86-NOBMI-NEXT: mull %ebx -; X86-NOBMI-NEXT: movl %edx, %edi -; X86-NOBMI-NEXT: movl %ebp, %eax -; X86-NOBMI-NEXT: mull %ebx +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull %esi ; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: movl %eax, %ebp -; X86-NOBMI-NEXT: addl %edi, %ebp -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %esi, %eax +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: mull %ecx -; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: adcl %ebx, %esi +; X86-NOBMI-NEXT: adcl %edx, %esi ; X86-NOBMI-NEXT: setb %al ; X86-NOBMI-NEXT: movzbl %al, %edi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax Index: llvm/test/CodeGen/X86/iabs.ll =================================================================== --- llvm/test/CodeGen/X86/iabs.ll +++ llvm/test/CodeGen/X86/iabs.ll @@ -152,14 +152,13 @@ ; ; X64-LABEL: test_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: xorq %rdx, %rax +; X64-NEXT: xorq %rsi, %rdx ; X64-NEXT: retq %tmp1neg = sub i128 0, %a %b = icmp sgt i128 %a, -1 Index: llvm/test/CodeGen/X86/imul.ll =================================================================== --- llvm/test/CodeGen/X86/imul.ll +++ llvm/test/CodeGen/X86/imul.ll @@ -217,7 +217,6 @@ define i32 @mul33_32(i32 %A) { ; X64-LABEL: mul33_32: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax ; X64-NEXT: addl %edi, %eax @@ -346,7 +345,6 @@ define i32 @test2(i32 %a) { ; X64-LABEL: test2: ; 
X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax ; X64-NEXT: addl %edi, %eax @@ -367,7 +365,6 @@ define i32 @test3(i32 %a) { ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax ; X64-NEXT: addl %edi, %eax Index: llvm/test/CodeGen/X86/midpoint-int-vec-128.ll =================================================================== --- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -62,8 +62,7 @@ ; SSE41-NEXT: psubd %xmm3, %xmm1 ; SSE41-NEXT: psrld $1, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm2 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_reg: @@ -555,18 +554,17 @@ ; ; SSE41-LABEL: vec128_i32_signed_reg_mem: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminsd %xmm2, %xmm3 -; SSE41-NEXT: pmaxsd %xmm0, %xmm2 -; SSE41-NEXT: psubd %xmm3, %xmm2 -; SSE41-NEXT: psrld $1, %xmm2 +; SSE41-NEXT: pminsd %xmm1, %xmm3 +; SSE41-NEXT: pmaxsd %xmm0, %xmm1 +; SSE41-NEXT: psubd %xmm3, %xmm1 +; SSE41-NEXT: psrld $1, %xmm1 ; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i32_signed_reg_mem: @@ -909,19 +907,18 @@ ; SSE2-NEXT: pandn %xmm1, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 ; SSE2-NEXT: psrlq $33, %xmm3 ; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm1 -; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_reg_reg: @@ -960,9 +957,9 @@ ; SSE41-NEXT: pmuludq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm3, %xmm4 +; SSE41-NEXT: pmuludq %xmm4, %xmm3 ; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: paddq %xmm4, %xmm0 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_reg: @@ -1136,19 +1133,18 @@ ; SSE2-NEXT: pandn %xmm1, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 ; SSE2-NEXT: psrlq $33, %xmm3 ; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm1 -; SSE2-NEXT: paddq %xmm3, 
%xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm1, %xmm4 +; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_unsigned_reg_reg: @@ -1187,9 +1183,9 @@ ; SSE41-NEXT: pmuludq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm3, %xmm4 +; SSE41-NEXT: pmuludq %xmm4, %xmm3 ; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: paddq %xmm4, %xmm0 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg: @@ -1390,40 +1386,41 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movdqa (%rdi), %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pxor %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] ; SSE41-NEXT: pand %xmm4, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm7, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1] ; SSE41-NEXT: por %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: psubq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlq $1, %xmm2 ; SSE41-NEXT: psrlq $33, %xmm1 ; SSE41-NEXT: pmuludq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: paddq %xmm1, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm4, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: pmuludq %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1574,22 +1571,22 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr) nounwind { ; SSE2-LABEL: vec128_i64_signed_reg_mem: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm6 = xmm1[0,0,2,2] ; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pand %xmm7, %xmm5 @@ -1597,26 +1594,25 @@ ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: psubq %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 -; SSE2-NEXT: psrlq $33, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm1 -; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: psubq %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psrlq $1, %xmm3 +; SSE2-NEXT: psrlq $33, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_reg_mem: @@ -1624,22 +1620,21 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movdqa (%rdi), %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] ; SSE41-NEXT: pand %xmm4, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm7, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1] ; SSE41-NEXT: por %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm5 @@ -2062,8 +2057,7 @@ ; SSE-NEXT: psubw %xmm3, %xmm1 ; SSE-NEXT: psrlw $1, %xmm1 ; SSE-NEXT: pmullw %xmm1, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw 
%xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_reg: @@ -2169,22 +2163,21 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwind { ; SSE2-LABEL: vec128_i16_unsigned_reg_reg: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm2, %xmm3 -; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubusw %xmm1, %xmm3 ; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: psubw %xmm0, %xmm2 -; SSE2-NEXT: paddw %xmm1, %xmm2 -; SSE2-NEXT: paddw %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pmullw %xmm3, %xmm2 -; SSE2-NEXT: paddw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psubw %xmm0, %xmm3 +; SSE2-NEXT: paddw %xmm1, %xmm3 +; SSE2-NEXT: paddw %xmm0, %xmm3 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: paddw %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i16_unsigned_reg_reg: @@ -2438,18 +2431,17 @@ define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, <8 x i16>* %a2_addr) nounwind { ; SSE-LABEL: vec128_i16_signed_reg_mem: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pcmpgtw %xmm2, %xmm1 -; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtw %xmm1, %xmm2 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pminsw %xmm2, %xmm3 -; SSE-NEXT: pmaxsw %xmm0, %xmm2 -; SSE-NEXT: psubw %xmm3, %xmm2 -; SSE-NEXT: psrlw $1, %xmm2 +; SSE-NEXT: pminsw %xmm1, %xmm3 +; SSE-NEXT: pmaxsw %xmm0, %xmm1 +; SSE-NEXT: psubw %xmm3, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 ; SSE-NEXT: pmullw %xmm2, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-FALLBACK-LABEL: vec128_i16_signed_reg_mem: @@ -2699,10 +2691,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwind { ; SSE2-LABEL: vec128_i8_signed_reg_reg: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 @@ -2710,26 +2702,25 @@ ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: psubb %xmm4, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: psubb %xmm4, %xmm3 +; SSE2-NEXT: 
psrlw $1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm1, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: packuswb %xmm4, %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_signed_reg_reg: @@ -3447,10 +3438,10 @@ ; SSE2-LABEL: vec128_i8_signed_reg_mem: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 @@ -3458,26 +3449,25 @@ ; SSE2-NEXT: pandn %xmm3, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: psubb %xmm4, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: psubb %xmm4, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i8_signed_reg_mem: Index: llvm/test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i16.ll +++ llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -318,7 +318,6 @@ ; ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax ; X64-NEXT: addl %edi, %eax Index: llvm/test/CodeGen/X86/mul-constant-i32.ll =================================================================== 
--- llvm/test/CodeGen/X86/mul-constant-i32.ll +++ llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -487,7 +487,6 @@ ; ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax ; X64-NEXT: addl %edi, %eax @@ -1180,7 +1179,6 @@ ; ; X64-SLM-LABEL: test_mul_by_66: ; X64-SLM: # %bb.0: -; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SLM-NEXT: movl %edi, %eax ; X64-SLM-NEXT: shll $6, %eax ; X64-SLM-NEXT: addl %edi, %eax Index: llvm/test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i64.ll +++ llvm/test/CodeGen/X86/mul-constant-i64.ll @@ -1476,66 +1476,74 @@ define i64 @test_mul_spec(i64 %x) nounwind { ; X86-LABEL: test_mul_spec: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl $9, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: leal (%ebx,%ebx,8), %edi -; X86-NEXT: addl $42, %ecx -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: leal (%ebp,%ebp,8), %eax +; X86-NEXT: addl $42, %esi +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl $5, %edx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: leal (%ebx,%ebx,4), %ebx -; X86-NEXT: addl $2, %esi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: leal (%ebp,%ebp,4), %eax +; X86-NEXT: addl $2, %edi +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: imull %esi, %ebx ; X86-NEXT: addl %ebx, %edx -; X86-NEXT: imull %edi, %esi -; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: addl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_spec: ; X86-NOOPT: # %bb.0: +; X86-NOOPT-NEXT: pushl %ebp ; X86-NOOPT-NEXT: pushl %ebx ; X86-NOOPT-NEXT: pushl %edi ; X86-NOOPT-NEXT: pushl %esi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOOPT-NEXT: movl $9, %ecx -; X86-NOOPT-NEXT: movl %esi, %eax +; X86-NOOPT-NEXT: movl %edi, %eax ; X86-NOOPT-NEXT: mull %ecx -; X86-NOOPT-NEXT: movl %eax, %ecx -; X86-NOOPT-NEXT: leal (%ebx,%ebx,8), %edi -; X86-NOOPT-NEXT: addl $42, %ecx -; X86-NOOPT-NEXT: adcl %edx, %edi +; X86-NOOPT-NEXT: movl %eax, %esi +; X86-NOOPT-NEXT: movl %edx, %ecx +; X86-NOOPT-NEXT: leal (%ebp,%ebp,8), %eax +; X86-NOOPT-NEXT: addl $42, %esi +; X86-NOOPT-NEXT: adcl %eax, %ecx ; X86-NOOPT-NEXT: movl $5, %edx -; X86-NOOPT-NEXT: movl %esi, %eax +; X86-NOOPT-NEXT: movl %edi, %eax ; X86-NOOPT-NEXT: mull %edx -; X86-NOOPT-NEXT: movl %eax, %esi -; X86-NOOPT-NEXT: leal (%ebx,%ebx,4), %ebx -; X86-NOOPT-NEXT: addl $2, %esi -; X86-NOOPT-NEXT: adcl %edx, %ebx -; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull %esi -; X86-NOOPT-NEXT: imull %ecx, %ebx +; X86-NOOPT-NEXT: movl %eax, %edi +; X86-NOOPT-NEXT: movl %edx, %ebx +; 
X86-NOOPT-NEXT: leal (%ebp,%ebp,4), %eax +; X86-NOOPT-NEXT: addl $2, %edi +; X86-NOOPT-NEXT: adcl %eax, %ebx +; X86-NOOPT-NEXT: movl %esi, %eax +; X86-NOOPT-NEXT: mull %edi +; X86-NOOPT-NEXT: imull %esi, %ebx ; X86-NOOPT-NEXT: addl %ebx, %edx -; X86-NOOPT-NEXT: imull %edi, %esi -; X86-NOOPT-NEXT: addl %esi, %edx +; X86-NOOPT-NEXT: imull %ecx, %edi +; X86-NOOPT-NEXT: addl %edi, %edx ; X86-NOOPT-NEXT: popl %esi ; X86-NOOPT-NEXT: popl %edi ; X86-NOOPT-NEXT: popl %ebx +; X86-NOOPT-NEXT: popl %ebp ; X86-NOOPT-NEXT: retl ; ; X64-HSW-LABEL: test_mul_spec: Index: llvm/test/CodeGen/X86/mul-constant-i8.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i8.ll +++ llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -188,7 +188,6 @@ define i8 @test_mul_by_17(i8 %x) { ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax ; X64-NEXT: addl %edi, %eax Index: llvm/test/CodeGen/X86/mul128.ll =================================================================== --- llvm/test/CodeGen/X86/mul128.ll +++ llvm/test/CodeGen/X86/mul128.ll @@ -10,8 +10,8 @@ ; X64-NEXT: imulq %rdi, %rcx ; X64-NEXT: mulq %rdx ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %r8, %rdx ; X64-NEXT: retq ; ; X86-LABEL: foo: Index: llvm/test/CodeGen/X86/overflow.ll =================================================================== --- llvm/test/CodeGen/X86/overflow.ll +++ llvm/test/CodeGen/X86/overflow.ll @@ -56,7 +56,8 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: andl $1, %ecx -; X64-NEXT: leaq (%rcx,%rdx), %rax +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %1 = zext i64 %a to i128 Index: llvm/test/CodeGen/X86/palignr.ll =================================================================== --- llvm/test/CodeGen/X86/palignr.ll +++ llvm/test/CodeGen/X86/palignr.ll @@ -167,8 +167,8 @@ ; CHECK-SSE2-LABEL: test9: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retl ; Index: llvm/test/CodeGen/X86/phaddsub.ll =================================================================== --- llvm/test/CodeGen/X86/phaddsub.ll +++ llvm/test/CodeGen/X86/phaddsub.ll @@ -565,8 +565,7 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: pslld $16, %xmm1 -; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: phaddw_single_source4: Index: llvm/test/CodeGen/X86/pmul.ll =================================================================== --- llvm/test/CodeGen/X86/pmul.ll +++ llvm/test/CodeGen/X86/pmul.ll @@ -1062,43 +1062,45 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) { ; SSE2-LABEL: mul_v4i64_zero_upper_left: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; 
SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmuludq %xmm2, %xmm4 ; SSE2-NEXT: psrlq $32, %xmm2 ; SSE2-NEXT: pmuludq %xmm0, %xmm2 ; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: paddq %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 +; SSE2-NEXT: paddq %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: movaps %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v4i64_zero_upper_left: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: psrlq $32, %xmm2 ; SSE41-NEXT: pmuludq %xmm0, %xmm2 ; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: paddq %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: paddq %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pmuludq %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: pmuludq %xmm4, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE41-NEXT: pmuludq %xmm1, %xmm3 +; SSE41-NEXT: psllq $32, %xmm3 +; SSE41-NEXT: paddq %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mul_v4i64_zero_upper_left: Index: llvm/test/CodeGen/X86/pmulh.ll =================================================================== --- llvm/test/CodeGen/X86/pmulh.ll +++ llvm/test/CodeGen/X86/pmulh.ll @@ -356,26 +356,27 @@ ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm8, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm7 ; SSE41-NEXT: pmaddwd %xmm3, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: pmaddwd %xmm2, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm6, %xmm8 +; SSE41-NEXT: pmaddwd %xmm2, %xmm8 +; SSE41-NEXT: pand %xmm6, %xmm5 ; SSE41-NEXT: pmaddwd %xmm1, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: pmaddwd %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pmaddwd %xmm6, %xmm0 ; SSE41-NEXT: 
psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: psrld $16, %xmm8 +; SSE41-NEXT: packusdw %xmm7, %xmm8 ; SSE41-NEXT: psrld $16, %xmm5 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: Index: llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- llvm/test/CodeGen/X86/popcnt.ll +++ llvm/test/CodeGen/X86/popcnt.ll @@ -27,7 +27,6 @@ ; ; X64-LABEL: cnt8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrb %al ; X64-NEXT: andb $85, %al @@ -39,9 +38,8 @@ ; X64-NEXT: addb %al, %dil ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shrb $4, %al -; X64-NEXT: addl %edi, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: andb $15, %al -; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-POPCNT-LABEL: cnt8: Index: llvm/test/CodeGen/X86/powi.ll =================================================================== --- llvm/test/CodeGen/X86/powi.ll +++ llvm/test/CodeGen/X86/powi.ll @@ -46,8 +46,7 @@ ; X64-NEXT: mulsd %xmm1, %xmm1 ; X64-NEXT: mulsd %xmm1, %xmm0 ; X64-NEXT: mulsd %xmm1, %xmm1 -; X64-NEXT: mulsd %xmm0, %xmm1 -; X64-NEXT: movapd %xmm1, %xmm0 +; X64-NEXT: mulsd %xmm1, %xmm0 ; X64-NEXT: retq %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] ret double %ret Index: llvm/test/CodeGen/X86/pr42998.ll =================================================================== --- llvm/test/CodeGen/X86/pr42998.ll +++ llvm/test/CodeGen/X86/pr42998.ll @@ -5,11 +5,9 @@ define i64 @imm1_Oz(i32 %x, i32 %y) minsize nounwind { ; CHECK-LABEL: imm1_Oz: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $esi killed $esi def $rsi -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal 1(%rdi), %eax -; CHECK-NEXT: incl %esi -; CHECK-NEXT: addq %rsi, %rax +; CHECK: incl %edi +; CHECK-NEXT: leal 1(%rsi), %eax +; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: retq %x1 = add i32 %x, 1 %y1 = add i32 %y, 1 @@ -22,11 +20,9 @@ define i64 @imm1_Os(i32 %x, i32 %y) optsize nounwind { ; CHECK-LABEL: imm1_Os: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $esi killed $esi def $rsi -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal 1(%rdi), %eax -; CHECK-NEXT: incl %esi -; CHECK-NEXT: addq %rsi, %rax +; CHECK: incl %edi +; CHECK-NEXT: leal 1(%rsi), %eax +; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: retq %x1 = add i32 %x, 1 %y1 = add i32 %y, 1 @@ -41,18 +37,18 @@ ; FAST-INCDEC: # %bb.0: ; FAST-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi ; FAST-INCDEC-NEXT: # kill: def $edi killed $edi def $rdi -; FAST-INCDEC-NEXT: leal 1(%rdi), %eax -; FAST-INCDEC-NEXT: incl %esi -; FAST-INCDEC-NEXT: addq %rsi, %rax +; FAST-INCDEC-NEXT: incl %edi +; FAST-INCDEC-NEXT: leal 1(%rsi), %eax +; FAST-INCDEC-NEXT: addq %rdi, %rax ; FAST-INCDEC-NEXT: retq ; ; SLOW-INCDEC-LABEL: imm1_O2: ; SLOW-INCDEC: # %bb.0: -; SLOW-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi ; SLOW-INCDEC-NEXT: # kill: def $edi killed $edi def $rdi -; SLOW-INCDEC-NEXT: leal 1(%rdi), %eax -; SLOW-INCDEC-NEXT: addl $1, %esi -; SLOW-INCDEC-NEXT: addq %rsi, %rax +; SLOW-INCDEC-NEXT: # kill: def $esi killed $esi def $rsi +; SLOW-INCDEC-NEXT: addl $1, %edi +; SLOW-INCDEC-NEXT: leal 1(%rsi), %eax +; SLOW-INCDEC-NEXT: addq %rdi, %rax ; SLOW-INCDEC-NEXT: retq %x1 = add i32 %x, 1 %y1 = add i32 %y, 1 Index: 
llvm/test/CodeGen/X86/recip-fastmath.ll =================================================================== --- llvm/test/CodeGen/X86/recip-fastmath.ll +++ llvm/test/CodeGen/X86/recip-fastmath.ll @@ -442,11 +442,10 @@ ; ; HASWELL-LABEL: v4f32_one_step: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1 -; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm0 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step: @@ -461,11 +460,10 @@ ; ; KNL-LABEL: v4f32_one_step: ; KNL: # %bb.0: -; KNL-NEXT: vrcpps %xmm0, %xmm2 -; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1 -; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm0 +; KNL-NEXT: vrcpps %xmm0, %xmm1 +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; KNL-NEXT: retq ; ; SKX-LABEL: v4f32_one_step: @@ -812,11 +810,10 @@ ; ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1 -; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm0 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step: @@ -831,11 +828,10 @@ ; ; KNL-LABEL: v8f32_one_step: ; KNL: # %bb.0: -; KNL-NEXT: vrcpps %ymm0, %ymm2 -; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1 -; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm0 +; KNL-NEXT: vrcpps %ymm0, %ymm1 +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 +; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: v8f32_one_step: Index: llvm/test/CodeGen/X86/rev16.ll =================================================================== --- llvm/test/CodeGen/X86/rev16.ll +++ llvm/test/CodeGen/X86/rev16.ll @@ -40,13 +40,12 @@ ; ; X64-LABEL: not_rev16: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %eax ; X64-NEXT: shrl $8, %edi ; X64-NEXT: andl $65280, %edi # imm = 0xFF00 ; X64-NEXT: andl $16711680, %eax # imm = 0xFF0000 -; X64-NEXT: addl %edi, %eax +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %l8 = shl i32 %a, 8 %r8 = lshr i32 %a, 8 @@ -128,13 +127,12 @@ ; ; X64-LABEL: different_shift_amount: ; X64: # %bb.0: -; X64-NEXT: # kill: 
def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $9, %eax ; X64-NEXT: shrl $8, %edi ; X64-NEXT: andl $-16712192, %eax # imm = 0xFF00FE00 ; X64-NEXT: andl $16711935, %edi # imm = 0xFF00FF -; X64-NEXT: addl %edi, %eax +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %l8 = shl i32 %a, 9 %r8 = lshr i32 %a, 8 Index: llvm/test/CodeGen/X86/rot16.ll =================================================================== --- llvm/test/CodeGen/X86/rot16.ll +++ llvm/test/CodeGen/X86/rot16.ll @@ -220,10 +220,9 @@ ; X64-LABEL: rot16_trunc: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: shrl $11, %ecx -; X64-NEXT: shll $5, %eax -; X64-NEXT: orl %ecx, %eax +; X64-NEXT: shrl $11, %eax +; X64-NEXT: shll $5, %edi +; X64-NEXT: orl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %t0 = lshr i32 %x, 11 Index: llvm/test/CodeGen/X86/rotate-extract.ll =================================================================== --- llvm/test/CodeGen/X86/rotate-extract.ll +++ llvm/test/CodeGen/X86/rotate-extract.ll @@ -156,7 +156,7 @@ ; X64-NEXT: shlq $5, %rax ; X64-NEXT: shlq $10, %rdi ; X64-NEXT: shrq $57, %rax -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %lhs_mul = shl i64 %i, 5 %rhs_mul = shl i64 %i, 10 @@ -179,12 +179,11 @@ ; ; X64-LABEL: no_extract_shrl: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: andl $-8, %eax ; X64-NEXT: shll $25, %eax ; X64-NEXT: shrl $9, %edi -; X64-NEXT: addl %edi, %eax +; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %lhs_div = lshr i32 %i, 3 %rhs_div = lshr i32 %i, 9 Index: llvm/test/CodeGen/X86/rotate-multi.ll =================================================================== --- llvm/test/CodeGen/X86/rotate-multi.ll +++ llvm/test/CodeGen/X86/rotate-multi.ll @@ -6,10 +6,9 @@ ; CHECK-LABEL: f0: ; CHECK: # %bb.0: # %b0 ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: roll $7, %ecx -; CHECK-NEXT: roll $9, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: roll $7, %eax +; CHECK-NEXT: roll $9, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq b0: %v0 = shl i32 %a0, 7 @@ -27,11 +26,10 @@ ; CHECK-LABEL: f1: ; CHECK: # %bb.0: # %b0 ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: shll $7, %ecx -; CHECK-NEXT: roll $9, %eax -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shll $7, %eax +; CHECK-NEXT: roll $9, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq b0: %v0 = shl i32 %a0, 7 @@ -49,15 +47,14 @@ define i32 @f2(i32 %a0, i32 %a1) #0 { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: movl %edi, %ecx ; CHECK-NEXT: shll $11, %ecx ; CHECK-NEXT: shrl $21, %edi -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: shll $19, %edx -; CHECK-NEXT: shrl $13, %eax -; CHECK-NEXT: orl %edi, %eax -; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shll $19, %eax +; CHECK-NEXT: shrl $13, %esi +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retq %v0 = shl i32 %a0, 11 @@ -76,33 +73,33 @@ ; CHECK-LABEL: f3: ; CHECK: # %bb.0: # %b0 ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal (,%rdi,8), %eax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: shll $5, %ecx +; CHECK-NEXT: leal (,%rdi,8), %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $5, %eax ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shll $7, 
%edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: shll $13, %ecx -; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $13, %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shll $19, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: shrl $2, %ecx -; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $2, %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shrl $15, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: shrl $23, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: shrl $25, %edx -; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: shrl $23, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $25, %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: shrl $30, %edi -; CHECK-NEXT: orl %edx, %edi ; CHECK-NEXT: orl %edi, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retq b0: %v0 = shl i32 %a0, 3 Index: llvm/test/CodeGen/X86/sat-add.ll =================================================================== --- llvm/test/CodeGen/X86/sat-add.ll +++ llvm/test/CodeGen/X86/sat-add.ll @@ -230,7 +230,6 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) { ; ANY-LABEL: unsigned_sat_variable_i16_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpw %ax, %di @@ -277,7 +276,6 @@ define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) { ; ANY-LABEL: unsigned_sat_variable_i32_using_min: ; ANY: # %bb.0: -; ANY-NEXT: # kill: def $esi killed $esi def $rsi ; ANY-NEXT: movl %esi, %eax ; ANY-NEXT: notl %eax ; ANY-NEXT: cmpl %eax, %edi @@ -717,14 +715,13 @@ ; ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm1 +; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pxor %xmm1, %xmm2 ; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE42-NEXT: pxor %xmm0, %xmm2 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 -; SSE42-NEXT: por %xmm0, %xmm1 -; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm1 +; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 +; SSE42-NEXT: por %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: @@ -785,14 +782,13 @@ ; ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm1 +; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pxor %xmm1, %xmm2 ; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE42-NEXT: pxor %xmm0, %xmm2 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 -; SSE42-NEXT: por %xmm0, %xmm1 -; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm1 +; SSE42-NEXT: pcmpgtq %xmm1, %xmm2 +; SSE42-NEXT: por %xmm2, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: Index: llvm/test/CodeGen/X86/sdiv_fix.ll 
=================================================================== --- llvm/test/CodeGen/X86/sdiv_fix.ll +++ llvm/test/CodeGen/X86/sdiv_fix.ll @@ -277,12 +277,13 @@ ; X64-NEXT: movq %r14, %rdx ; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill +; X64-NEXT: leaq -1(%rax), %rbp ; X64-NEXT: testq %rbx, %rbx ; X64-NEXT: sets %al ; X64-NEXT: testq %r12, %r12 -; X64-NEXT: sets %bpl -; X64-NEXT: xorb %al, %bpl +; X64-NEXT: sets %r13b +; X64-NEXT: xorb %al, %r13b ; X64-NEXT: movq %r15, %rdi ; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %r14, %rdx @@ -290,10 +291,9 @@ ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %bpl, %al -; X64-NEXT: leaq -1(%r13), %rax -; X64-NEXT: cmovneq %rax, %r13 -; X64-NEXT: movq %r13, %rax +; X64-NEXT: testb %r13b, %al +; X64-NEXT: cmoveq (%rsp), %rbp # 8-byte Folded Reload +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 Index: llvm/test/CodeGen/X86/select-constant-xor.ll =================================================================== --- llvm/test/CodeGen/X86/select-constant-xor.ll +++ llvm/test/CodeGen/X86/select-constant-xor.ll @@ -135,7 +135,6 @@ define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) { ; CHECK-LABEL: oneusecmp: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: sarl $31, %eax ; CHECK-NEXT: xorl $127, %eax Index: llvm/test/CodeGen/X86/select.ll =================================================================== --- llvm/test/CodeGen/X86/select.ll +++ llvm/test/CodeGen/X86/select.ll @@ -1548,10 +1548,9 @@ define i64 @PR51612(i64 %x, i64 %y) { ; CHECK-LABEL: PR51612: ; CHECK: ## %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: incl %esi -; CHECK-NEXT: incq %rax -; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: leal 1(%rsi), %eax +; CHECK-NEXT: incq %rdi +; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: andl 10, %eax ; CHECK-NEXT: retq ; Index: llvm/test/CodeGen/X86/shift-logic.ll =================================================================== --- llvm/test/CodeGen/X86/shift-logic.ll +++ llvm/test/CodeGen/X86/shift-logic.ll @@ -4,10 +4,10 @@ define i8 @shl_and(i8 %x, i8 %y) nounwind { ; CHECK-LABEL: shl_and: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shlb $2, %sil -; CHECK-NEXT: shlb $5, %al -; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal (,%rsi,4), %eax +; CHECK-NEXT: shlb $5, %dil +; CHECK-NEXT: andb %dil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %sh0 = shl i8 %x, 3 Index: llvm/test/CodeGen/X86/smax.ll =================================================================== --- llvm/test/CodeGen/X86/smax.ll +++ llvm/test/CodeGen/X86/smax.ll @@ -141,12 +141,12 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X64-LABEL: test_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmovaq %rdi, %rax ; X64-NEXT: cmpq %rcx, %rsi -; X64-NEXT: cmovgq %rdi, %rax -; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovgq %rdi, %rdx +; X64-NEXT: cmovneq %rdx, %rax ; X64-NEXT: cmovgq %rsi, %rcx ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq @@ -238,8 +238,7 @@ ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; 
SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i32: @@ -271,8 +270,7 @@ ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v3i32: @@ -312,8 +310,7 @@ ; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: @@ -360,14 +357,12 @@ ; SSE-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: @@ -528,8 +523,7 @@ ; SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16i8: Index: llvm/test/CodeGen/X86/smin.ll =================================================================== --- llvm/test/CodeGen/X86/smin.ll +++ llvm/test/CodeGen/X86/smin.ll @@ -141,12 +141,12 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X64-LABEL: test_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmovbq %rdi, %rax ; X64-NEXT: cmpq %rcx, %rsi -; X64-NEXT: cmovlq %rdi, %rax -; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovlq %rdi, %rdx +; X64-NEXT: cmovneq %rdx, %rax ; X64-NEXT: cmovlq %rsi, %rcx ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/smul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/smul_fix_sat.ll +++ llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -65,7 +65,6 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %ecx, %eax @@ -73,61 +72,59 @@ ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %esi ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnsl %ebx, %ebp -; X86-NEXT: cmovnsl %edx, %edi -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: cmovnsl %edx, %esi +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: subl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl $0, %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %edx -; X86-NEXT: cmovnsl %edi, %ecx -; X86-NEXT: testl %edx, %edx +; X86-NEXT: cmovnsl %ebp, %edi +; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setg %bl ; X86-NEXT: sete %bh ; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al -; X86-NEXT: andb %bh, %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: shrdl $2, %esi, %ebx -; X86-NEXT: shrdl $2, %ecx, %esi -; X86-NEXT: testb %al, %al -; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %esi, %edi -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: setae %dl +; X86-NEXT: andb %bh, %dl +; X86-NEXT: orb %bl, %dl +; X86-NEXT: movl (%esp), %ebx +; X86-NEXT: shrdl $2, %eax, %ebx +; X86-NEXT: shrdl $2, %ecx, %eax +; X86-NEXT: testb %dl, %dl +; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovel %ebx, %edx +; X86-NEXT: cmpl $-1, %edi ; X86-NEXT: setl %bl -; X86-NEXT: sete %dl +; X86-NEXT: sete %al ; X86-NEXT: cmpl $-2, %ecx ; X86-NEXT: setb %cl -; X86-NEXT: andb %dl, %cl -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: andb %al, %cl +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: orb %bl, %cl -; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 -; X86-NEXT: cmovel %edi, %edx +; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -378,45 +375,45 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: imull %ebp, %ebx -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %esi, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull %edi, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edx, %edi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; 
X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: setb %bl @@ -424,10 +421,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movzbl %bl, %edi ; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ebx, %edi ; X86-NEXT: sarl $31, %edi @@ -439,9 +436,9 @@ ; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ecx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovel %ebx, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %esi, %edx ; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 @@ -677,18 +674,18 @@ ; X86-NEXT: andb %bh, %bl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: cmovel %edx, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: cmpl $-1, %ecx ; X86-NEXT: setl %cl -; X86-NEXT: sete %ch -; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: orb %cl, %ch -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: sete %dl +; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: orb %cl, %dl +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -761,14 +758,14 @@ ; X86-NEXT: shrdl $31, %ecx, %edx ; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000 ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovgel %esi, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovgel %esi, %eax -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: cmovll %edx, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovgel %edx, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000 -; X86-NEXT: cmovll %esi, %eax -; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: cmovgel %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll =================================================================== --- llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -31,17 +31,17 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 { ; NHM-LABEL: v4f32_no_daz: ; NHM: # %bb.0: -; NHM-NEXT: rsqrtps %xmm0, %xmm2 -; NHM-NEXT: movaps %xmm0, 
%xmm1 -; NHM-NEXT: mulps %xmm2, %xmm1 +; NHM-NEXT: rsqrtps %xmm0, %xmm1 +; NHM-NEXT: movaps %xmm0, %xmm2 +; NHM-NEXT: mulps %xmm1, %xmm2 ; NHM-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; NHM-NEXT: mulps %xmm1, %xmm3 -; NHM-NEXT: mulps %xmm2, %xmm1 -; NHM-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; NHM-NEXT: mulps %xmm2, %xmm3 +; NHM-NEXT: mulps %xmm1, %xmm2 +; NHM-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; NHM-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; NHM-NEXT: mulps %xmm3, %xmm1 -; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; NHM-NEXT: cmpleps %xmm0, %xmm2 +; NHM-NEXT: mulps %xmm3, %xmm2 +; NHM-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: cmpleps %xmm0, %xmm1 ; NHM-NEXT: andps %xmm2, %xmm1 ; NHM-NEXT: movaps %xmm1, %xmm0 ; NHM-NEXT: retq @@ -87,32 +87,33 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 { ; NHM-LABEL: v8f32_no_daz: ; NHM: # %bb.0: -; NHM-NEXT: movaps %xmm0, %xmm2 -; NHM-NEXT: rsqrtps %xmm0, %xmm3 -; NHM-NEXT: mulps %xmm3, %xmm0 -; NHM-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; NHM-NEXT: movaps %xmm0, %xmm5 -; NHM-NEXT: mulps %xmm4, %xmm5 -; NHM-NEXT: mulps %xmm3, %xmm0 -; NHM-NEXT: movaps {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; NHM-NEXT: addps %xmm3, %xmm0 -; NHM-NEXT: mulps %xmm5, %xmm0 -; NHM-NEXT: movaps {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN] -; NHM-NEXT: andps %xmm5, %xmm2 -; NHM-NEXT: movaps {{.*#+}} xmm6 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; NHM-NEXT: movaps %xmm6, %xmm7 -; NHM-NEXT: cmpleps %xmm2, %xmm7 -; NHM-NEXT: andps %xmm7, %xmm0 -; NHM-NEXT: rsqrtps %xmm1, %xmm7 -; NHM-NEXT: movaps %xmm1, %xmm2 -; NHM-NEXT: mulps %xmm7, %xmm2 +; NHM-NEXT: rsqrtps %xmm0, %xmm2 +; NHM-NEXT: movaps %xmm0, %xmm4 ; NHM-NEXT: mulps %xmm2, %xmm4 -; NHM-NEXT: mulps %xmm7, %xmm2 -; NHM-NEXT: addps %xmm3, %xmm2 -; NHM-NEXT: mulps %xmm4, %xmm2 -; NHM-NEXT: andps %xmm5, %xmm1 -; NHM-NEXT: cmpleps %xmm1, %xmm6 -; NHM-NEXT: andps %xmm6, %xmm2 +; NHM-NEXT: movaps {{.*#+}} xmm5 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; NHM-NEXT: movaps %xmm4, %xmm3 +; NHM-NEXT: mulps %xmm5, %xmm3 +; NHM-NEXT: mulps %xmm2, %xmm4 +; NHM-NEXT: movaps {{.*#+}} xmm6 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; NHM-NEXT: addps %xmm6, %xmm4 +; NHM-NEXT: mulps %xmm3, %xmm4 +; NHM-NEXT: movaps {{.*#+}} xmm7 = [NaN,NaN,NaN,NaN] +; NHM-NEXT: andps %xmm7, %xmm0 +; NHM-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; NHM-NEXT: movaps %xmm2, %xmm3 +; NHM-NEXT: cmpleps %xmm0, %xmm3 +; NHM-NEXT: andps %xmm4, %xmm3 +; NHM-NEXT: rsqrtps %xmm1, %xmm0 +; NHM-NEXT: movaps %xmm1, %xmm4 +; NHM-NEXT: mulps %xmm0, %xmm4 +; NHM-NEXT: mulps %xmm4, %xmm5 +; NHM-NEXT: mulps %xmm0, %xmm4 +; NHM-NEXT: addps %xmm6, %xmm4 +; NHM-NEXT: mulps %xmm5, %xmm4 +; NHM-NEXT: andps %xmm7, %xmm1 +; NHM-NEXT: cmpleps %xmm1, %xmm2 +; NHM-NEXT: andps %xmm4, %xmm2 +; NHM-NEXT: movaps %xmm3, %xmm0 ; NHM-NEXT: movaps %xmm2, %xmm1 ; NHM-NEXT: retq ; Index: llvm/test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -283,17 +283,17 @@ define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 { ; SSE-LABEL: sqrt_v4f32_check_denorms_ninf: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 +; 
SSE-NEXT: rsqrtps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: mulps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: mulps %xmm3, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; SSE-NEXT: cmpleps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SSE-NEXT: cmpleps %xmm0, %xmm1 ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq @@ -632,9 +632,8 @@ ; SSE-NEXT: mulss %xmm2, %xmm1 ; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: mulss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: div_sqrt_fabs_f32: @@ -813,9 +812,8 @@ ; SSE-NEXT: mulss %xmm1, %xmm2 ; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: mulss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: div_sqrt_f32: Index: llvm/test/CodeGen/X86/sse-minmax.ll =================================================================== --- llvm/test/CodeGen/X86/sse-minmax.ll +++ llvm/test/CodeGen/X86/sse-minmax.ll @@ -103,8 +103,7 @@ ; STRICT-NEXT: cmplesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole: @@ -261,11 +260,10 @@ define double @ole_x(double %x) { ; STRICT-LABEL: ole_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_x: @@ -338,8 +336,7 @@ ; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm0, %xmm2 -; STRICT-NEXT: movapd %xmm2, %xmm0 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt: @@ -499,11 +496,10 @@ define double @ugt_x(double %x) { ; STRICT-LABEL: ugt_x: ; STRICT: # %bb.0: -; STRICT-NEXT: xorpd %xmm2, %xmm2 -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: xorpd %xmm1, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_x: @@ -762,13 +758,12 @@ define double @ole_y(double %x) { ; STRICT-LABEL: ole_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: 
andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_y: @@ -839,13 +834,12 @@ define double @ugt_y(double %x) { ; STRICT-LABEL: ugt_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm0 -; STRICT-NEXT: andnpd %xmm2, %xmm1 -; STRICT-NEXT: orpd %xmm0, %xmm1 -; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 +; STRICT-NEXT: andpd %xmm2, %xmm0 +; STRICT-NEXT: andnpd %xmm1, %xmm2 +; STRICT-NEXT: orpd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_y: Index: llvm/test/CodeGen/X86/sshl_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sshl_sat.ll +++ llvm/test/CodeGen/X86/sshl_sat.ll @@ -205,18 +205,18 @@ ; X64-LABEL: func5: ; X64: # %bb.0: ; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: sets %dl -; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: movq %rdi, %rdx -; X64-NEXT: shlq %cl, %rdx -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: sets %al +; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq %cl, %rax +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: sarq %cl, %rsi ; X64-NEXT: cmpq %rsi, %rdi -; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovneq %rdx, %rax ; X64-NEXT: retq ; ; X86-LABEL: func5: Index: llvm/test/CodeGen/X86/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat.ll +++ llvm/test/CodeGen/X86/ssub_sat.ll @@ -207,19 +207,18 @@ ; ; X64-LABEL: vec: ; X64: # %bb.0: -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: psubd %xmm1, %xmm2 -; X64-NEXT: pcmpgtd %xmm3, %xmm1 -; X64-NEXT: pcmpgtd %xmm2, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psubd %xmm1, %xmm3 +; X64-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-NEXT: pcmpgtd %xmm3, %xmm0 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: pandn %xmm2, %xmm1 -; X64-NEXT: psrad $31, %xmm2 -; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: pandn %xmm3, %xmm1 +; X64-NEXT: psrad $31, %xmm3 +; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: pand %xmm3, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -612,36 +612,34 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: 
pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: @@ -715,36 +713,34 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: @@ -818,60 +814,58 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) 
nounwind { ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm5, %xmm2 +; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: psubd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: psubd %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: psubd %xmm3, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm2, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: @@ -967,108 +961,102 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: psubd %xmm4, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: 
pandn %xmm9, %xmm4 +; SSE2-NEXT: psrad $31, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psubd %xmm5, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psubd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pxor %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psubd %xmm7, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psubd %xmm7, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: psubd %xmm4, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm0, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSSE3-NEXT: psubd %xmm4, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm9, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm0 ; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: psubd %xmm5, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 -; SSSE3-NEXT: pxor %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: pandn %xmm1, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: por 
%xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: psubd %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: psubd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm6, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pxor %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: psubd %xmm7, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm7, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm5, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: Index: llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll =================================================================== --- llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -336,7 +336,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: kandb %k0, %k1, %k1 ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -369,7 +370,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: kandb %k0, %k1, %k1 ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -421,7 +423,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: kandw %k0, %k1, %k1 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -454,7 +457,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: kandw %k0, %k1, %k1 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: 
retq Index: llvm/test/CodeGen/X86/stack-folding-int-avx512.ll =================================================================== --- llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -2037,7 +2037,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2069,7 +2070,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2101,7 +2103,8 @@ ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll =================================================================== --- llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll +++ llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll @@ -39,11 +39,9 @@ define i64 @test__blcic_u64(i64 %a0) { ; X64-LABEL: test__blcic_u64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq 1(%rdi), %rax +; X64-NEXT: xorq $-1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -87,11 +85,9 @@ define i64 @test__blsic_u64(i64 %a0) { ; X64-LABEL: test__blsic_u64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: leaq -1(%rdi), %rax +; X64-NEXT: xorq $-1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 @@ -102,11 +98,9 @@ define i64 @test__t1mskc_u64(i64 %a0) { ; X64-LABEL: test__t1mskc_u64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: addq $1, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: leaq 1(%rdi), %rax +; X64-NEXT: xorq $-1, %rdi +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = add i64 %a0, 1 @@ -117,11 +111,9 @@ define i64 @test__tzmsk_u64(i64 %a0) { ; X64-LABEL: test__tzmsk_u64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq $-1, %rcx -; X64-NEXT: subq $1, %rax -; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq -1(%rdi), %rax +; X64-NEXT: xorq $-1, %rdi +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq %1 = xor i64 %a0, -1 %2 = sub i64 %a0, 1 Index: llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll 
=================================================================== --- llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll @@ -71,10 +71,10 @@ ; ; X64-LABEL: test__blcic_u32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax ; X64-NEXT: andl %ecx, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 @@ -152,10 +152,10 @@ ; ; X64-LABEL: test__blsic_u32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal -1(%rdi), %eax ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax ; X64-NEXT: orl %ecx, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 @@ -176,10 +176,10 @@ ; ; X64-LABEL: test__t1mskc_u32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 1(%rdi), %eax ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: addl $1, %eax ; X64-NEXT: orl %ecx, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 @@ -200,10 +200,10 @@ ; ; X64-LABEL: test__tzmsk_u32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal -1(%rdi), %eax ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: xorl $-1, %ecx -; X64-NEXT: subl $1, %eax ; X64-NEXT: andl %ecx, %eax ; X64-NEXT: retq %1 = xor i32 %a0, -1 Index: llvm/test/CodeGen/X86/udiv_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -172,20 +172,20 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: leaq (%rdi,%rdi), %rsi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $63, %rax -; X64-NEXT: shrdq $33, %rax, %rsi +; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: shrq $63, %rsi +; X64-NEXT: shldq $31, %rax, %rsi ; X64-NEXT: shlq $32, %rdi ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: callq __udivti3@PLT ; X64-NEXT: cmpq $2, %rdx ; X64-NEXT: movq $-1, %rcx -; X64-NEXT: cmovbq %rax, %rcx +; X64-NEXT: cmovaeq %rcx, %rax ; X64-NEXT: cmpq $1, %rdx -; X64-NEXT: movl $1, %eax -; X64-NEXT: cmovbq %rdx, %rax -; X64-NEXT: shldq $63, %rcx, %rax +; X64-NEXT: movl $1, %ecx +; X64-NEXT: cmovbq %rdx, %rcx +; X64-NEXT: shrdq $1, %rcx, %rax ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; Index: llvm/test/CodeGen/X86/umax.ll =================================================================== --- llvm/test/CodeGen/X86/umax.ll +++ llvm/test/CodeGen/X86/umax.ll @@ -137,12 +137,12 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X64-LABEL: test_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: cmovaq %rdi, %rdx -; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmovaq %rdi, %rax -; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: cmovneq %rdx, %rax ; X64-NEXT: cmovaq %rsi, %rcx ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq @@ -358,23 +358,22 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; SSE-LABEL: test_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm4 -; SSE-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE-NEXT: pand %xmm4, 
%xmm0 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm5 -; SSE-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: Index: llvm/test/CodeGen/X86/umin.ll =================================================================== --- llvm/test/CodeGen/X86/umin.ll +++ llvm/test/CodeGen/X86/umin.ll @@ -137,12 +137,12 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X64-LABEL: test_i128: ; X64: # %bb.0: -; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: cmovbq %rdi, %rdx -; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: cmovbq %rdi, %rax -; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: cmovneq %rdx, %rax ; X64-NEXT: cmovbq %rsi, %rcx ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/umul_fix.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix.ll +++ llvm/test/CodeGen/X86/umul_fix.ll @@ -60,10 +60,9 @@ ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: shldl $30, %eax, %ecx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: shldl $30, %esi, %eax -; X86-NEXT: movl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -373,11 +372,11 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/umul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix_sat.ll +++ llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -282,22 +282,21 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %bl ; X86-NEXT: andb %dl, %bl ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto %bh -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto %cl ; X86-NEXT: orb %bh, %cl -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: 
addl %esi, %edx @@ -445,30 +444,30 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $1, %edi +; X86-NEXT: cmpl $1, %ebx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: orl %ecx, %eax @@ -501,26 +500,26 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shrdl $31, %edx, %eax ; X86-NEXT: movl %edx, %esi @@ -530,9 +529,8 @@ ; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: notl %edi ; X86-NEXT: orl %edi, %eax -; X86-NEXT: shldl $1, %edx, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrdl $31, %ecx, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -17,13 +17,12 @@ ; X64-NEXT: seto %r10b ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %rcx ; X64-NEXT: seto %r11b ; X64-NEXT: orb %r10b, %r11b -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: addq %rax, %rsi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: setb %cl ; X64-NEXT: orb %r11b, %cl ; X64-NEXT: orb %r9b, %cl Index: 
llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -19,22 +19,21 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %bl ; X86-NEXT: andb %dl, %bl ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto %bh -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto %ch ; X86-NEXT: orb %bh, %ch -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx Index: llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll =================================================================== --- llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -545,18 +545,18 @@ define i32 @out_constant_varx_mone(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_varx_mone: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %edi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %edi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_mone: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %edi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %mask, %x @@ -674,11 +674,10 @@ ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: movl %edx, %ecx -; CHECK-NOBMI-NEXT: notl %ecx -; CHECK-NOBMI-NEXT: andl %edi, %ecx -; CHECK-NOBMI-NEXT: andl $42, %eax -; CHECK-NOBMI-NEXT: orl %ecx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl %edi, %eax +; CHECK-NOBMI-NEXT: andl $42, %edx +; CHECK-NOBMI-NEXT: orl %edx, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_42_invmask: @@ -758,18 +757,18 @@ define i32 @out_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_mone_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; 
CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, -1 @@ -846,20 +845,20 @@ define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: notl %edx -; CHECK-NOBMI-NEXT: andl $42, %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %esi +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl $42, %eax +; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_42_vary_invmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: andl %edx, %eax -; CHECK-BMI-NEXT: notl %edx -; CHECK-BMI-NEXT: andl $42, %edx -; CHECK-BMI-NEXT: orl %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %esi +; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: notl %eax +; CHECK-BMI-NEXT: andl $42, %eax +; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: retq %notmask = xor i32 %mask, -1 %mx = and i32 %notmask, 42 Index: llvm/test/CodeGen/X86/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-lkk.ll +++ llvm/test/CodeGen/X86/urem-lkk.ll @@ -41,18 +41,17 @@ define i32 @combine_urem_udiv(i32 %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: imulq $1491936009, %rax, %rax # imm = 0x58ED2309 -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: subl %eax, %ecx -; CHECK-NEXT: shrl %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: shrl $6, %ecx -; CHECK-NEXT: imull $95, %ecx, %eax -; CHECK-NEXT: subl %eax, %edi -; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: imulq $1491936009, %rax, %rcx # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: shrl %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: shrl $6, %eax +; CHECK-NEXT: imull $95, %eax, %ecx +; CHECK-NEXT: subl %ecx, %edi +; CHECK-NEXT: addl %edi, %eax ; CHECK-NEXT: retq %1 = urem i32 %x, 95 %2 = udiv i32 %x, 95 Index: llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll =================================================================== --- llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -279,9 +279,9 @@ ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_tautological: Index: llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -3037,30 +3037,28 @@ define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 { ; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; SSE-32: # %bb.0: -; SSE-32-NEXT: movaps {{.*#+}} xmm2 = 
[2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-32-NEXT: movaps %xmm0, %xmm3 -; SSE-32-NEXT: cmpltps %xmm2, %xmm3 -; SSE-32-NEXT: movaps %xmm3, %xmm1 -; SSE-32-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; SSE-32-NEXT: andnps %xmm2, %xmm3 -; SSE-32-NEXT: subps %xmm3, %xmm0 +; SSE-32-NEXT: movaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-32-NEXT: movaps %xmm0, %xmm2 +; SSE-32-NEXT: cmpltps %xmm1, %xmm2 +; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; SSE-32-NEXT: andnps %xmm1, %xmm2 +; SSE-32-NEXT: subps %xmm2, %xmm0 ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: xorps %xmm0, %xmm1 -; SSE-32-NEXT: movaps %xmm1, %xmm0 +; SSE-32-NEXT: xorps %xmm3, %xmm0 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-64-NEXT: movaps %xmm0, %xmm3 -; SSE-64-NEXT: cmpltps %xmm2, %xmm3 -; SSE-64-NEXT: movaps %xmm3, %xmm1 -; SSE-64-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-64-NEXT: andnps %xmm2, %xmm3 -; SSE-64-NEXT: subps %xmm3, %xmm0 +; SSE-64-NEXT: movaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-64-NEXT: movaps %xmm0, %xmm2 +; SSE-64-NEXT: cmpltps %xmm1, %xmm2 +; SSE-64-NEXT: movaps %xmm2, %xmm3 +; SSE-64-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE-64-NEXT: andnps %xmm1, %xmm2 +; SSE-64-NEXT: subps %xmm2, %xmm0 ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: xorps %xmm0, %xmm1 -; SSE-64-NEXT: movaps %xmm1, %xmm0 +; SSE-64-NEXT: xorps %xmm3, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32: Index: llvm/test/CodeGen/X86/vec_ctbits.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ctbits.ll +++ llvm/test/CodeGen/X86/vec_ctbits.ll @@ -23,11 +23,10 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c @@ -93,11 +92,10 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %c Index: llvm/test/CodeGen/X86/vec_minmax_sint.ll =================================================================== --- llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -120,16 +120,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; 
SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -192,8 +192,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v4i32: @@ -222,14 +221,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v8i32: @@ -319,8 +316,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v16i8: @@ -349,14 +345,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_gt_v32i8: @@ -507,16 +501,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -579,8 +573,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v4i32: @@ -609,14 +602,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa 
%xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: @@ -706,8 +697,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v16i8: @@ -736,14 +726,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v32i8: Index: llvm/test/CodeGen/X86/vec_minmax_uint.ll =================================================================== --- llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -130,16 +130,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -245,23 +245,22 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_gt_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
max_gt_v8i32: @@ -537,16 +536,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -652,23 +651,22 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: max_ge_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: max_ge_v8i32: Index: llvm/test/CodeGen/X86/vec_saddo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_saddo.ll +++ llvm/test/CodeGen/X86/vec_saddo.ll @@ -817,11 +817,11 @@ ; SSE2-NEXT: pslld $8, %xmm2 ; SSE2-NEXT: psrad $8, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pslld $8, %xmm0 -; SSE2-NEXT: psrad $8, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: movw %ax, (%rdi) @@ -852,11 +852,11 @@ ; SSSE3-NEXT: pslld $8, %xmm2 ; SSSE3-NEXT: psrad $8, %xmm2 ; SSSE3-NEXT: paddd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pslld $8, %xmm0 -; SSSE3-NEXT: psrad $8, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movd %xmm2, 
%eax ; SSSE3-NEXT: movw %ax, (%rdi) @@ -881,25 +881,24 @@ ; ; SSE41-LABEL: saddo_v4i24: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pslld $8, %xmm1 ; SSE41-NEXT: psrad $8, %xmm1 -; SSE41-NEXT: pslld $8, %xmm2 -; SSE41-NEXT: psrad $8, %xmm2 -; SSE41-NEXT: paddd %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pslld $8, %xmm0 ; SSE41-NEXT: psrad $8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $8, %xmm2 +; SSE41-NEXT: psrad $8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm2, %eax +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax ; SSE41-NEXT: movw %ax, 9(%rdi) -; SSE41-NEXT: pextrd $2, %xmm2, %ecx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx ; SSE41-NEXT: movw %cx, 6(%rdi) -; SSE41-NEXT: pextrd $1, %xmm2, %edx +; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) -; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: movd %xmm0, %esi ; SSE41-NEXT: movw %si, (%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 11(%rdi) @@ -909,6 +908,7 @@ ; SSE41-NEXT: movb %dl, 5(%rdi) ; SSE41-NEXT: shrl $16, %esi ; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: saddo_v4i24: @@ -989,11 +989,10 @@ ; SSE-NEXT: pslld $31, %xmm1 ; SSE-NEXT: movmskps %xmm1, %eax ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movb %al, (%rdi) -; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: saddo_v4i1: Index: llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll =================================================================== --- llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll +++ llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll @@ -9,9 +9,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16: @@ -32,9 +31,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psraw $15, %xmm1 ; SSE-NEXT: psrlw $11, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: psraw $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psraw $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec8x16_minsize: @@ -55,9 +53,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $28, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrad $4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: psrad $4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: sdiv_vec4x32: @@ -104,15 +101,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: psrld $26, %xmm2 -; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: psrad $6, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: psrld $26, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 -; SSE-NEXT: psrad $6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrad $6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: psrld $26, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrad $6, %xmm1 ; SSE-NEXT: 
retq ; ; AVX1-LABEL: sdiv8x32: @@ -147,15 +142,13 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psraw $15, %xmm2 ; SSE-NEXT: psrlw $14, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: psraw $2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psraw $15, %xmm3 -; SSE-NEXT: psrlw $14, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: psraw $2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: psraw $2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psraw $15, %xmm2 +; SSE-NEXT: psrlw $14, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: psraw $2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: sdiv16x16: Index: llvm/test/CodeGen/X86/vec_shift6.ll =================================================================== --- llvm/test/CodeGen/X86/vec_shift6.ll +++ llvm/test/CodeGen/X86/vec_shift6.ll @@ -69,16 +69,14 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pslld $1, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test4: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pslld $1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: test4: Index: llvm/test/CodeGen/X86/vec_smulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_smulo.ll +++ llvm/test/CodeGen/X86/vec_smulo.ll @@ -112,9 +112,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -246,9 +247,10 @@ ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) ; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -378,9 +380,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3094,19 +3097,19 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pslld $8, %xmm3 -; SSE41-NEXT: psrad $8, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: psrad $8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: pextrd $2, %xmm1, %ecx ; SSE41-NEXT: pextrd $1, %xmm1, %edx ; 
SSE41-NEXT: movd %xmm1, %esi ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) Index: llvm/test/CodeGen/X86/vec_ssubo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ssubo.ll +++ llvm/test/CodeGen/X86/vec_ssubo.ll @@ -826,11 +826,11 @@ ; SSE2-NEXT: pslld $8, %xmm2 ; SSE2-NEXT: psrad $8, %xmm2 ; SSE2-NEXT: psubd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pslld $8, %xmm0 -; SSE2-NEXT: psrad $8, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: movw %ax, (%rdi) @@ -861,11 +861,11 @@ ; SSSE3-NEXT: pslld $8, %xmm2 ; SSSE3-NEXT: psrad $8, %xmm2 ; SSSE3-NEXT: psubd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pslld $8, %xmm0 -; SSSE3-NEXT: psrad $8, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movd %xmm2, %eax ; SSSE3-NEXT: movw %ax, (%rdi) @@ -890,25 +890,24 @@ ; ; SSE41-LABEL: ssubo_v4i24: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pslld $8, %xmm1 ; SSE41-NEXT: psrad $8, %xmm1 -; SSE41-NEXT: pslld $8, %xmm2 -; SSE41-NEXT: psrad $8, %xmm2 -; SSE41-NEXT: psubd %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pslld $8, %xmm0 ; SSE41-NEXT: psrad $8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $8, %xmm2 +; SSE41-NEXT: psrad $8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrd $3, %xmm2, %eax +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax ; SSE41-NEXT: movw %ax, 9(%rdi) -; SSE41-NEXT: pextrd $2, %xmm2, %ecx +; SSE41-NEXT: pextrd $2, %xmm0, %ecx ; SSE41-NEXT: movw %cx, 6(%rdi) -; SSE41-NEXT: pextrd $1, %xmm2, %edx +; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) -; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: movd %xmm0, %esi ; SSE41-NEXT: movw %si, (%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 11(%rdi) @@ -918,6 +917,7 @@ ; SSE41-NEXT: movb %dl, 5(%rdi) ; SSE41-NEXT: shrl $16, %esi ; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: ssubo_v4i24: @@ -998,11 +998,10 @@ ; SSE-NEXT: pslld $31, %xmm1 ; SSE-NEXT: movmskps %xmm1, %eax ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movb %al, (%rdi) -; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ssubo_v4i1: Index: llvm/test/CodeGen/X86/vec_umulo.ll =================================================================== --- 
llvm/test/CodeGen/X86/vec_umulo.ll +++ llvm/test/CodeGen/X86/vec_umulo.ll @@ -2740,29 +2740,29 @@ ; ; SSE41-LABEL: umulo_v4i24: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pmuludq %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: pmulld %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: pmulld %xmm0, %xmm1 ; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: pextrd $2, %xmm1, %ecx ; SSE41-NEXT: pextrd $1, %xmm1, %edx ; SSE41-NEXT: movd %xmm1, %esi -; SSE41-NEXT: psrld $24, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) ; SSE41-NEXT: movw %dx, 3(%rdi) Index: llvm/test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- llvm/test/CodeGen/X86/vector-bitreverse.ll +++ llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -19,7 +19,6 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind { ; SSE-LABEL: test_bitreverse_i8: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: rolb $4, %dil ; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: andb $51, %al @@ -32,13 +31,11 @@ ; SSE-NEXT: addb %al, %al ; SSE-NEXT: shrb %dil ; SSE-NEXT: andb $85, %dil -; SSE-NEXT: addl %edi, %eax -; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: orb %dil, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i8: ; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: rolb $4, %dil ; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: andb $51, %al @@ -51,8 +48,7 @@ ; AVX-NEXT: addb %al, %al ; AVX-NEXT: shrb %dil ; AVX-NEXT: andb $85, %dil -; AVX-NEXT: addl %edi, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: orb %dil, %al ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i8: @@ -65,7 +61,6 @@ ; ; GFNISSE-LABEL: test_bitreverse_i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi ; GFNISSE-NEXT: rolb $4, %dil ; GFNISSE-NEXT: movl %edi, %eax ; GFNISSE-NEXT: andb $51, %al @@ -78,13 +73,11 @@ ; GFNISSE-NEXT: addb %al, %al ; GFNISSE-NEXT: shrb %dil ; GFNISSE-NEXT: andb $85, %dil -; GFNISSE-NEXT: addl %edi, %eax -; GFNISSE-NEXT: # kill: def $al killed $al killed $eax +; GFNISSE-NEXT: orb %dil, %al ; GFNISSE-NEXT: retq ; ; GFNIAVX-LABEL: 
test_bitreverse_i8: ; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX-NEXT: rolb $4, %dil ; GFNIAVX-NEXT: movl %edi, %eax ; GFNIAVX-NEXT: andb $51, %al @@ -97,13 +90,11 @@ ; GFNIAVX-NEXT: addb %al, %al ; GFNIAVX-NEXT: shrb %dil ; GFNIAVX-NEXT: andb $85, %dil -; GFNIAVX-NEXT: addl %edi, %eax -; GFNIAVX-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX-NEXT: orb %dil, %al ; GFNIAVX-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX2-NEXT: rolb $4, %dil ; GFNIAVX2-NEXT: movl %edi, %eax ; GFNIAVX2-NEXT: andb $51, %al @@ -116,13 +107,11 @@ ; GFNIAVX2-NEXT: addb %al, %al ; GFNIAVX2-NEXT: shrb %dil ; GFNIAVX2-NEXT: andb $85, %dil -; GFNIAVX2-NEXT: addl %edi, %eax -; GFNIAVX2-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX2-NEXT: orb %dil, %al ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: test_bitreverse_i8: ; GFNIAVX512F: # %bb.0: -; GFNIAVX512F-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX512F-NEXT: rolb $4, %dil ; GFNIAVX512F-NEXT: movl %edi, %eax ; GFNIAVX512F-NEXT: andb $51, %al @@ -135,13 +124,11 @@ ; GFNIAVX512F-NEXT: addb %al, %al ; GFNIAVX512F-NEXT: shrb %dil ; GFNIAVX512F-NEXT: andb $85, %dil -; GFNIAVX512F-NEXT: addl %edi, %eax -; GFNIAVX512F-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX512F-NEXT: orb %dil, %al ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: test_bitreverse_i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX512BW-NEXT: rolb $4, %dil ; GFNIAVX512BW-NEXT: movl %edi, %eax ; GFNIAVX512BW-NEXT: andb $51, %al @@ -154,8 +141,7 @@ ; GFNIAVX512BW-NEXT: addb %al, %al ; GFNIAVX512BW-NEXT: shrb %dil ; GFNIAVX512BW-NEXT: andb $85, %dil -; GFNIAVX512BW-NEXT: addl %edi, %eax -; GFNIAVX512BW-NEXT: # kill: def $al killed $al killed $eax +; GFNIAVX512BW-NEXT: orb %dil, %al ; GFNIAVX512BW-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b Index: llvm/test/CodeGen/X86/vector-ext-logic.ll =================================================================== --- llvm/test/CodeGen/X86/vector-ext-logic.ll +++ llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -348,21 +348,21 @@ define <8 x i32> @bool_sext_and(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_sext_and: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -386,21 +386,21 @@ define <8 x i32> @bool_sext_or(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_sext_or: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -424,21 +424,21 @@ define <8 x i32> @bool_sext_xor(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_sext_xor: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -35,14 +35,14 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE2-NEXT: shufpd{{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllq %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: @@ -55,14 +55,14 @@ ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllq %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 
= xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: @@ -178,22 +178,22 @@ ; ; X86-SSE2-LABEL: var_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 ; X86-SSE2-NEXT: psrlq $1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pand %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psllq %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ret <2 x i64> %res @@ -254,14 +254,14 @@ ; SSE41-NEXT: psrld %xmm4, %xmm6 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] ; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v4i32: @@ -453,32 +453,31 @@ ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: paddw %xmm4, %xmm4 ; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $23, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: 
pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v8i16: @@ -522,9 +521,8 @@ ; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v8i16: @@ -699,32 +697,31 @@ ; X86-SSE2-NEXT: por %xmm5, %xmm3 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4 ; X86-SSE2-NEXT: psraw $15, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm1 ; X86-SSE2-NEXT: psrlw $1, %xmm3 ; X86-SSE2-NEXT: pand %xmm4, %xmm3 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm4, %xmm1 -; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm5, %xmm4 +; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm4, %xmm2 -; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; X86-SSE2-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: paddd %xmm5, %xmm2 +; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,6,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ret <8 x i16> %res @@ -1171,23 +1168,23 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 ; X86-SSE2-NEXT: psrlq $1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pand %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: psllq %xmm3, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) @@ -2066,12 +2063,12 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlq $60, %xmm2 ; SSE2-NEXT: psrlq $50, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq $4, %xmm2 +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq $4, %xmm1 ; SSE2-NEXT: psllq $14, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: orpd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i64: @@ -2079,12 +2076,12 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrlq $50, %xmm2 ; SSE41-NEXT: psrlq $60, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $14, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $14, %xmm1 ; SSE41-NEXT: psllq $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v2i64: @@ -2164,23 +2161,23 @@ ; ; X86-SSE2-LABEL: constant_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = 
<4,u,14,u> -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <4,u,14,u> +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 ; X86-SSE2-NEXT: psrlq $1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psllq %xmm3, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm3, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> ) ret <2 x i64> %res @@ -2220,10 +2217,10 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrld $26, %xmm2 ; SSE41-NEXT: psrld $28, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v4i32: Index: llvm/test/CodeGen/X86/vector-fshl-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1126,17 +1126,16 @@ ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm3, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psllw %xmm3, %xmm5 +; SSE2-NEXT: psllw %xmm1, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} 
xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1146,32 +1145,30 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw %xmm3, %xmm2 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllw %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE41-NEXT: psllw %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pshufb %xmm3, %xmm5 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: psubb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pshufb %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: psrlw %xmm1, %xmm4 ; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v16i8: @@ -1336,17 +1333,16 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X86-SSE2-NEXT: psubb %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllw %xmm3, %xmm1 +; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psllw %xmm1, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; X86-SSE2-NEXT: psllw %xmm3, %xmm5 +; X86-SSE2-NEXT: psllw %xmm1, %xmm5 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm5[0,0,0,0,4,5,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-SSE2-NEXT: pand %xmm3, %xmm1 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1356,9 +1352,8 @@ ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -439,24 +439,22 @@ define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: splatconstant_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $28, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $4, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrld $28, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslld $4, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movsd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatconstant_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pslld $4, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $28, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $4, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatconstant_funnnel_v2i32: @@ -520,13 +518,12 @@ ; ; X86-SSE2-LABEL: splatconstant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $28, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pslld $4, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-SSE2-NEXT: movaps %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $28, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pslld $4, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movsd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) ret <2 x i32> %res Index: llvm/test/CodeGen/X86/vector-fshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -34,15 +34,15 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: psllq $1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 
%xmm3 -; SSE2-NEXT: psllq %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: @@ -54,15 +54,15 @@ ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: psllq $1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllq %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: @@ -179,22 +179,22 @@ ; ; X86-SSE2-LABEL: var_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] +; X86-SSE2-NEXT: pandn %xmm4, %xmm2 ; X86-SSE2-NEXT: psllq $1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psllq %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ret <2 x i64> %res @@ -254,15 +254,15 @@ ; SSE41-NEXT: psrld %xmm4, %xmm6 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] ; SSE41-NEXT: pandn %xmm8, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE41-NEXT: pslld $1, %xmm0 -; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v4i32: @@ -456,17 +456,17 @@ ; SSE2-NEXT: movdqa 
%xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $23, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd %xmm5, %xmm2 @@ -474,11 +474,11 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: psllw $1, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v8i16: @@ -701,17 +701,17 @@ ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4 ; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm5, %xmm3 -; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm3 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-SSE2-NEXT: paddd %xmm5, %xmm1 +; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd %xmm5, %xmm2 @@ -719,11 +719,11 @@ ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; X86-SSE2-NEXT: psllw $1, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ret <8 x i16> %res @@ 
-732,65 +732,65 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: psllw $5, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: psllw $5, %xmm6 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: paddb %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: paddb %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: paddb %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: paddb %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm1, %xmm6 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm2 ; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm0, %xmm5 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm0, %xmm5 ; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v16i8: @@ -993,65 +993,65 @@ ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa 
{{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pand %xmm4, %xmm5 -; X86-SSE2-NEXT: psllw $5, %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: psllw $5, %xmm6 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pxor %xmm6, %xmm6 -; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm7 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7 ; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: por %xmm7, %xmm1 -; X86-SSE2-NEXT: paddb %xmm5, %xmm5 -; X86-SSE2-NEXT: pxor %xmm6, %xmm6 -; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: paddb %xmm6, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm7 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7 ; X86-SSE2-NEXT: psrlw $2, %xmm1 -; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: por %xmm7, %xmm1 -; X86-SSE2-NEXT: paddb %xmm5, %xmm5 -; X86-SSE2-NEXT: pxor %xmm6, %xmm6 -; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm5 +; X86-SSE2-NEXT: paddb %xmm6, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pandn %xmm1, %xmm6 ; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand %xmm6, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pandn %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 ; X86-SSE2-NEXT: psllw $5, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5 ; X86-SSE2-NEXT: psllw $4, %xmm0 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5 ; X86-SSE2-NEXT: psllw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0 ; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; 
X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ret <16 x i8> %res @@ -1159,23 +1159,23 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X86-SSE2-NEXT: pandn %xmm4, %xmm3 ; X86-SSE2-NEXT: psllq $1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: psllq %xmm3, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) @@ -1768,12 +1768,12 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlq $4, %xmm2 ; SSE2-NEXT: psrlq $14, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psllq $60, %xmm2 +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq $60, %xmm1 ; SSE2-NEXT: psllq $50, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: orpd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i64: @@ -1781,12 +1781,12 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrlq $14, %xmm2 ; SSE41-NEXT: psrlq $4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq $50, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $50, %xmm1 ; SSE41-NEXT: psllq $60, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v2i64: @@ -1867,23 +1867,23 @@ ; ; X86-SSE2-LABEL: constant_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u> -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; X86-SSE2-NEXT: pandn %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <4,u,14,u> +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pand %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 ; X86-SSE2-NEXT: psllq $1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psllq %xmm3, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psllq %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm3, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> ) ret <2 x i64> %res @@ -1923,10 +1923,10 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psrld $6, %xmm2 ; SSE41-NEXT: psrld $4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v4i32: Index: llvm/test/CodeGen/X86/vector-fshr-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1202,30 +1202,29 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm2, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: psllw %xmm2, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm3, %xmm0 -; SSE2-NEXT: psrlw %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: psrlw %xmm1, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: @@ -1234,23 +1233,22 @@ ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllw %xmm4, %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psllw %xmm1, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: psllw %xmm4, %xmm6 +; SSE41-NEXT: psllw %xmm1, %xmm6 ; SSE41-NEXT: pshufb %xmm2, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: psubb %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm2, %xmm0 -; SSE41-NEXT: psrlw %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 +; SSE41-NEXT: psrlw %xmm1, %xmm5 ; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v16i8: @@ -1421,30 +1419,29 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: psubb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X86-SSE2-NEXT: psubb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-SSE2-NEXT: psubb %xmm2, %xmm1 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllw %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psllw %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; X86-SSE2-NEXT: psllw %xmm2, %xmm5 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: psrlw %xmm3, %xmm0 -; X86-SSE2-NEXT: psrlw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE2-NEXT: psrlw %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw %xmm1, %xmm4 ; X86-SSE2-NEXT: psrlw $8, %xmm4 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -466,24 +466,22 @@ define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: splatconstant_funnnel_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld $4, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $28, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslld $28, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movsd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatconstant_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld $4, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pslld $28, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $4, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $28, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatconstant_funnnel_v2i32: @@ -547,13 +545,12 @@ ; ; X86-SSE2-LABEL: splatconstant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pslld $28, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-SSE2-NEXT: movaps %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pslld $28, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movsd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> ) ret <2 x i32> %res Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- 
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -542,8 +542,7 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pslld $3, %xmm2 ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_4i32: @@ -652,8 +651,7 @@ ; SSE2-NEXT: psllw $3, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_16i8: @@ -679,8 +677,7 @@ ; SSE41-NEXT: psllw $3, %xmm2 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_16i8: Index: llvm/test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -123,8 +123,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64: @@ -156,8 +155,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64: @@ -258,8 +256,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) @@ -376,8 +373,7 @@ ; SSSE3-NEXT: psrlq $32, %xmm0 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: psrlq $32, %xmm1 -; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64u: @@ -409,8 +405,7 @@ ; SSE41-NEXT: psrlq $32, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: paddq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64u: @@ -511,8 +506,7 @@ ; X32-SSE-NEXT: psrlq $32, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlq $32, %xmm1 -; X32-SSE-NEXT: paddq %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1) @@ -606,56 +600,54 @@ ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb 
%xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32: @@ -740,8 +732,7 @@ ; X32-SSE-NEXT: psrld $16, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddd %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0) @@ -835,56 +826,54 @@ ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; 
SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqw %xmm4, %xmm0 ; SSSE3-NEXT: psrld $16, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrld $16, %xmm1 -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: paddd %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv4i32u: @@ -969,8 +958,7 @@ ; X32-SSE-NEXT: psrld $16, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrld $16, %xmm1 -; X32-SSE-NEXT: paddd %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddd %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1) @@ -1052,44 +1040,42 @@ ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw 
$8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16: @@ -1163,8 +1149,7 @@ ; X32-SSE-NEXT: psrlw $8, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) ret <8 x i16> %out @@ -1245,44 +1230,42 @@ ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrlw $4, %xmm1 -; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: paddb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: paddb %xmm1, %xmm3 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm3 +; SSSE3-NEXT: paddw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $4, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw 
$4, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16u: @@ -1356,8 +1339,7 @@ ; X32-SSE-NEXT: psrlw $8, %xmm0 ; X32-SSE-NEXT: pand %xmm1, %xmm0 ; X32-SSE-NEXT: psrlw $8, %xmm1 -; X32-SSE-NEXT: paddw %xmm0, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: paddw %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) ret <8 x i16> %out Index: llvm/test/CodeGen/X86/vector-mul.ll =================================================================== --- llvm/test/CodeGen/X86/vector-mul.ll +++ llvm/test/CodeGen/X86/vector-mul.ll @@ -223,8 +223,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psllq $4, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v2i64_17: @@ -302,8 +301,7 @@ ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psllw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: paddb %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: paddb %xmm1, %xmm0 ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: mul_v16i8_17: @@ -311,8 +309,7 @@ ; X64-SSE-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE-NEXT: psllw $4, %xmm1 ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE-NEXT: paddb %xmm0, %xmm1 -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: paddb %xmm1, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_17: @@ -343,12 +340,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psllq $4, %xmm3 -; SSE-NEXT: paddq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllq $4, %xmm2 +; SSE-NEXT: paddq %xmm2, %xmm1 ; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v4i64_17: @@ -443,15 +438,13 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psllw $4, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psllw $4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: paddb %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: paddb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllw $4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: paddb %xmm2, %xmm1 ; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v32i8_17: Index: llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll =================================================================== --- 
llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -14,14 +14,13 @@ define <16 x i8> @ugt_1_v16i8(<16 x i8> %0) { ; SSE-LABEL: ugt_1_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_1_v16i8: @@ -141,10 +140,9 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_2_v16i8: @@ -161,10 +159,9 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_2_v16i8: @@ -426,10 +423,9 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_3_v16i8: @@ -446,10 +442,9 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_3_v16i8: @@ -711,10 +706,9 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_4_v16i8: @@ -731,10 +725,9 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, 
%xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_4_v16i8: @@ -996,10 +989,9 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_5_v16i8: @@ -1016,10 +1008,9 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_5_v16i8: @@ -1281,10 +1272,9 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_6_v16i8: @@ -1301,10 +1291,9 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_6_v16i8: @@ -1554,14 +1543,13 @@ define <8 x i16> @ugt_1_v8i16(<8 x i16> %0) { ; SSE-LABEL: ugt_1_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_1_v8i16: @@ -5816,14 +5804,13 @@ define <4 x i32> @ugt_1_v4i32(<4 x i32> %0) { ; SSE-LABEL: ugt_1_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_1_v4i32: @@ -17020,14 +17007,13 @@ ; ; SSE41-LABEL: ugt_1_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, 
%xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddq %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_1_v2i64: Index: llvm/test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -25,11 +25,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: @@ -46,11 +45,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -475,9 +473,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -494,9 +491,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: @@ -688,8 +684,9 @@ ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] @@ -703,8 +700,9 @@ ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE3-NEXT: paddq %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2] +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] ; SSE3-NEXT: pand %xmm3, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] @@ -718,8 +716,9 @@ ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSSE3-NEXT: paddq %xmm0, %xmm2 ; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm3 = xmm0[1,0,3,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] @@ -811,12 +810,12 @@ ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ne_1_v2i64: @@ -828,12 +827,12 @@ ; SSE3-NEXT: pand %xmm0, %xmm3 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSE3-NEXT: pand %xmm4, %xmm0 +; SSE3-NEXT: pand %xmm0, %xmm4 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE3-NEXT: pand %xmm3, %xmm0 +; SSE3-NEXT: pxor %xmm2, %xmm0 +; SSE3-NEXT: por %xmm4, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ne_1_v2i64: @@ -845,26 +844,26 @@ ; SSSE3-NEXT: pand %xmm0, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,0,3,2] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ne_1_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: paddq %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ne_1_v2i64: @@ -1017,16 +1016,16 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; SSE-LABEL: ne_1_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: paddd %xmm3, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ne_1_v4i32: @@ -1146,16 +1145,16 @@ define <8 x i16> @ne_1_v8i16(<8 x 
i16> %0) { ; SSE-LABEL: ne_1_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: paddw %xmm3, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ne_1_v8i16: @@ -1273,16 +1272,16 @@ define <16 x i8> @ne_1_v16i8(<16 x i8> %0) { ; SSE-LABEL: ne_1_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpeqb %xmm1, %xmm2 ; SSE-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: paddb %xmm3, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: ne_1_v16i8: Index: llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -281,8 +281,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: addss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -334,8 +333,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f32_zero: @@ -390,8 +388,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f32_zero: @@ -462,8 +459,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16f32_zero: @@ -526,8 +522,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: addss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -579,8 +574,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f32_undef: @@ 
-635,8 +629,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f32_undef: @@ -707,8 +700,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16f32_undef: @@ -995,8 +987,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v2f64_zero: @@ -1031,8 +1022,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f64_zero: @@ -1081,8 +1071,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f64_zero: @@ -1202,8 +1191,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v2f64_undef: @@ -1238,8 +1226,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v4f64_undef: @@ -1288,8 +1275,7 @@ ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f64_undef: Index: llvm/test/CodeGen/X86/vector-reduce-fadd.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -340,8 +340,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: addss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -1346,8 +1345,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v2f64_zero: @@ -1381,11 +1379,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm2 -; SSE-NEXT: addsd %xmm1, %xmm2 +; SSE-NEXT: addsd %xmm2, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: addsd %xmm1, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; 
AVX1-SLOW-LABEL: test_v4f64_zero: @@ -1439,17 +1436,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: addsd %xmm0, %xmm4 -; SSE-NEXT: addsd %xmm1, %xmm4 +; SSE-NEXT: addsd %xmm4, %xmm0 +; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: addsd %xmm1, %xmm4 -; SSE-NEXT: addsd %xmm2, %xmm4 +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: addsd %xmm2, %xmm4 -; SSE-NEXT: addsd %xmm3, %xmm4 +; SSE-NEXT: addsd %xmm2, %xmm0 +; SSE-NEXT: addsd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: addsd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: addsd %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v8f64_zero: @@ -1531,7 +1527,7 @@ ; SSE-LABEL: test_v16f64_zero: ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: addsd %xmm8, %xmm0 ; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] Index: llvm/test/CodeGen/X86/vector-reduce-fmax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -69,45 +69,41 @@ define float @test_v3f32(<3 x float> %a0) { ; SSE2-LABEL: test_v3f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpunordss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: andps %xmm2, %xmm3 -; SSE2-NEXT: maxss %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: maxss %xmm1, %xmm2 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: cmpunordss %xmm2, %xmm2 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v3f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: maxss %xmm1, %xmm2 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm0, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: andnps %xmm1, %xmm2 
+; SSE41-NEXT: orps %xmm3, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: andnps %xmm1, %xmm2 +; SSE41-NEXT: orps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v3f32: Index: llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -204,8 +204,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_zero: @@ -246,8 +245,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_zero: @@ -288,8 +286,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_zero: @@ -340,8 +337,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_zero: @@ -381,8 +377,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_undef: @@ -423,8 +418,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32_undef: @@ -465,8 +459,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32_undef: @@ -517,8 +510,7 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32_undef: @@ -704,8 +696,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_zero: @@ -729,8 +720,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: 
test_v4f64_zero: @@ -762,8 +752,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_zero: @@ -841,8 +830,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: @@ -866,8 +854,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: @@ -899,8 +886,7 @@ ; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: Index: llvm/test/CodeGen/X86/vector-reduce-fmul.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -339,8 +339,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v2f32_one: @@ -1211,8 +1210,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_one: @@ -1235,11 +1233,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm2 -; SSE-NEXT: mulsd %xmm1, %xmm2 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_one: @@ -1272,17 +1269,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: mulsd %xmm0, %xmm4 -; SSE-NEXT: mulsd %xmm1, %xmm4 +; SSE-NEXT: mulsd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm4 -; SSE-NEXT: mulsd %xmm2, %xmm4 +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm4 -; SSE-NEXT: mulsd %xmm3, %xmm4 +; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: mulsd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_one: @@ -1329,7 +1325,7 @@ ; SSE-LABEL: test_v16f64_one: ; SSE: # %bb.0: ; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] Index: llvm/test/CodeGen/X86/vector-reduce-smax.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -37,16 +37,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -128,16 +128,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 @@ -284,16 +284,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -561,16 +561,16 @@ ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 Index: llvm/test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -37,16 +37,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movq %xmm2, %rax ; SSE41-NEXT: retq @@ -134,16 +134,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 @@ -306,16 +306,16 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor 
%xmm5, %xmm0 @@ -610,16 +610,16 @@ ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 Index: llvm/test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rotate-128.ll +++ llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1096,17 +1096,16 @@ ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllw %xmm3, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psllw %xmm3, %xmm5 +; SSE2-NEXT: psllw %xmm1, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1116,32 +1115,30 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_rotate_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw %xmm3, %xmm2 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; 
SSE41-NEXT: psllw %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE41-NEXT: psllw %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pshufb %xmm3, %xmm5 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: psubb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psllw %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pshufb %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: psrlw %xmm1, %xmm4 ; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_rotate_v16i8: @@ -1288,17 +1285,16 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X86-SSE2-NEXT: psubb %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] -; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllw %xmm3, %xmm1 +; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psllw %xmm1, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; X86-SSE2-NEXT: psllw %xmm3, %xmm5 +; X86-SSE2-NEXT: psllw %xmm1, %xmm5 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,0,0,4,5,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-SSE2-NEXT: pand %xmm3, %xmm1 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1308,9 +1304,8 @@ ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %splat8 = sub <16 x i8> , %splat Index: llvm/test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ 
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1091,11 +1091,11 @@ ; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm0, %xmm1 ; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: andnps %xmm2, %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1162,11 +1162,11 @@ ; X86-SSE-NEXT: psraw $2, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] ; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-SSE-NEXT: movaps %xmm2, %xmm0 -; X86-SSE-NEXT: andps %xmm1, %xmm0 +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; X86-SSE-NEXT: movaps %xmm2, %xmm1 +; X86-SSE-NEXT: andps %xmm0, %xmm1 ; X86-SSE-NEXT: psraw $1, %xmm2 -; X86-SSE-NEXT: andnps %xmm2, %xmm1 +; X86-SSE-NEXT: andnps %xmm2, %xmm0 ; X86-SSE-NEXT: orps %xmm1, %xmm0 ; X86-SSE-NEXT: retl %shift = ashr <8 x i16> %a, Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1784,11 +1784,11 @@ ; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: andps %xmm0, %xmm2 ; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: andnps %xmm1, %xmm0 ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -1851,11 +1851,11 @@ ; X86-SSE-NEXT: psraw $2, %xmm1 ; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; X86-SSE-NEXT: movaps %xmm1, %xmm0 -; X86-SSE-NEXT: andps %xmm2, %xmm0 +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535] +; X86-SSE-NEXT: movaps %xmm1, %xmm2 +; X86-SSE-NEXT: andps %xmm0, %xmm2 ; X86-SSE-NEXT: psraw $1, %xmm1 -; X86-SSE-NEXT: andnps %xmm1, %xmm2 +; X86-SSE-NEXT: andnps %xmm1, %xmm0 ; X86-SSE-NEXT: orps %xmm2, %xmm0 ; X86-SSE-NEXT: retl %shift = ashr <4 x i16> %a, Index: llvm/test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2693,8 +2693,7 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: addps %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: addps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR22390: @@ -2702,8 +2701,7 @@ ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; SSSE3-NEXT: movaps %xmm0, %xmm2 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSSE3-NEXT: addps %xmm0, %xmm2 
-; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: addps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR22390: Index: llvm/test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-math.ll +++ llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2816,8 +2816,8 @@ ; SSE-NEXT: packuswb %xmm7, %xmm6 ; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: packuswb %xmm5, %xmm0 ; SSE-NEXT: packuswb %xmm6, %xmm0 ; SSE-NEXT: retq @@ -2868,8 +2868,8 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-tzcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -33,11 +33,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: @@ -57,11 +56,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: @@ -236,11 +234,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64u: @@ -260,11 +257,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64u: @@ -1277,9 +1273,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -1299,9 +1294,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: @@ -1434,9 +1428,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: @@ -1456,9 +1449,8 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8u: Index: llvm/test/CodeGen/X86/vector-unsigned-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -402,9 +402,9 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pmaxub %xmm0, %xmm1 -; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pmaxub %xmm0, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: uge_v16i8: @@ -430,9 +430,9 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pminub %xmm0, %xmm1 -; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pminub %xmm0, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: ule_v16i8: Index: llvm/test/CodeGen/X86/vselect-minmax.ll =================================================================== --- llvm/test/CodeGen/X86/vselect-minmax.ll +++ llvm/test/CodeGen/X86/vselect-minmax.ll @@ -63,8 +63,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test3: @@ -89,8 +88,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test4: @@ -383,8 +381,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test19: @@ -409,8 +406,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test20: @@ -637,14 +633,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: 
pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test27: @@ -684,14 +678,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test28: @@ -1229,14 +1221,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test43: @@ -1276,14 +1266,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test44: @@ -1421,23 +1409,22 @@ define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test47: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test47: @@ -1473,23 +1460,22 @@ define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test48: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor 
%xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test48: @@ -1529,8 +1515,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test49: @@ -1555,8 +1540,7 @@ ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test50: @@ -1849,8 +1833,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test65: @@ -1875,8 +1858,7 @@ ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test66: @@ -2063,14 +2045,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test73: @@ -2110,14 +2090,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test74: @@ -2655,14 +2633,12 @@ ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test89: @@ -2702,14 +2678,12 @@ ; SSE2-NEXT: pcmpgtd 
%xmm2, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test90: @@ -2835,23 +2809,22 @@ define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test93: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test93: @@ -2887,23 +2860,22 @@ define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) { ; SSE2-LABEL: test94: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test94: @@ -3191,26 +3163,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, 
%xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test99: @@ -3267,26 +3235,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test100: @@ -4101,26 +4065,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test115: @@ -4168,26 +4128,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; 
SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test116: @@ -4381,40 +4337,38 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test119: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test119: @@ -4458,40 +4412,38 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test120: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa 
%xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test120: @@ -5619,26 +5571,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test129: @@ -5695,26 +5643,22 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; 
SSE2-NEXT: pcmpgtb %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test130: @@ -6547,26 +6491,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test145: @@ -6614,26 +6554,22 @@ ; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test146: @@ -6803,40 +6739,38 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test149: ; SSE2: # %bb.0: # 
%entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test149: @@ -6880,40 +6814,38 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: test150: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 ; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm10 ; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 
-; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm9 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test150: Index: llvm/test/CodeGen/X86/vselect-zero.ll =================================================================== --- llvm/test/CodeGen/X86/vselect-zero.ll +++ llvm/test/CodeGen/X86/vselect-zero.ll @@ -142,12 +142,11 @@ ; SSE2-LABEL: vsel_nonzero_constants: ; SSE2: # %bb.0: ; SSE2-NEXT: cmplepd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: andnpd %xmm0, %xmm2 -; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: orpd %xmm2, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: vsel_nonzero_constants: Index: llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll =================================================================== --- llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll +++ llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll @@ -42,8 +42,7 @@ ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-NEXT: addss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/x86-shifts.ll =================================================================== --- llvm/test/CodeGen/X86/x86-shifts.ll +++ llvm/test/CodeGen/X86/x86-shifts.ll @@ -152,14 +152,13 @@ define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind { ; CHECK-LABEL: shr2_nosplat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $8, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-NEXT: xorps %xmm2, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: psrlq $8, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: psrlq $1, %xmm2 +; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; CHECK-NEXT: xorpd %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: %B = lshr <2 x i64> %A, < i64 8, i64 1>