Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -373,19 +373,25 @@
   return false;
 }
 
-/// Given a register, if has a single in-basic block use, return the use
-/// instruction if it's a copy or a two-address use.
+/// Given a register, if all its uses are in the same basic block, return the
+/// last use instruction if it's a copy or a two-address use.
 static MachineInstr *
 findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
                        MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
-                       bool &IsCopy, Register &DstReg, bool &IsDstPhys) {
-  if (!MRI->hasOneNonDBGUse(Reg))
-    // None or more than one use.
-    return nullptr;
-  MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
-  MachineInstr &UseMI = *UseOp.getParent();
-  if (UseMI.getParent() != MBB)
+                       bool &IsCopy, Register &DstReg, bool &IsDstPhys,
+                       LiveIntervals *LIS) {
+  MachineOperand *UseOp = nullptr;
+  for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB)
+      return nullptr;
+    if (isPlainlyKilled(MI, Reg, LIS))
+      UseOp = &MO;
+  }
+  if (!UseOp)
     return nullptr;
+  MachineInstr &UseMI = *UseOp->getParent();
+
   Register SrcReg;
   bool IsSrcPhys;
   if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
@@ -399,7 +405,7 @@
   }
   if (UseMI.isCommutable()) {
     unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
-    unsigned Src2 = UseMI.getOperandNo(&UseOp);
+    unsigned Src2 = UseMI.getOperandNo(UseOp);
     if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
       MachineOperand &MO = UseMI.getOperand(Src1);
       if (MO.isReg() && MO.isUse() &&
@@ -744,7 +750,7 @@
     Register NewReg;
     Register Reg = DstReg;
     while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
-                                                        NewReg, IsDstPhys)) {
+                                                        NewReg, IsDstPhys, LIS)) {
       if (IsCopy && !Processed.insert(UseMI).second)
         break;
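(A note on the new loop, since its control flow is easy to misread: it returns nullptr as soon as any non-debug use of Reg sits outside MBB, and otherwise keeps overwriting UseOp so that the last plainly-killed use wins; if no use kills Reg it also returns nullptr. A minimal standalone C++ sketch of just that selection rule, using hypothetical stand-in types (Use, pickLastKilledUse) rather than the real LLVM API:

    #include <vector>

    struct Use {
      int Block;   // basic block this use lives in
      bool Kills;  // whether the use is a plain kill of the register
    };

    // All uses must be in block MBB; among those, the last one that kills
    // the register is the "interesting" use. Any escaping use vetoes all.
    static const Use *pickLastKilledUse(const std::vector<Use> &Uses, int MBB) {
      const Use *Last = nullptr;
      for (const Use &U : Uses) {
        if (U.Block != MBB)
          return nullptr; // a use outside the block: give up entirely
        if (U.Kills)
          Last = &U;      // keep overwriting; the final kill wins
      }
      return Last;        // nullptr when nothing kills the register
    }

The register shuffling in the test updates that follow is downstream of this relaxation: the pass can now chase a different, later use, which changes which operands get commuted and therefore which registers the generated code ends up using.)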
Index: llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
===================================================================
--- llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -32,8 +32,8 @@
 ; THUMB6-NEXT: uxtb r1, r1
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #128
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -64,8 +64,8 @@
 ; THUMB6-NEXT: uxtb r1, r1
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -97,8 +97,8 @@
 ; THUMB6-NEXT: uxtb r1, r1
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #24
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -134,8 +134,8 @@
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #1
 ; THUMB6-NEXT: lsls r1, r1, #15
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -166,8 +166,8 @@
 ; THUMB6-NEXT: uxth r1, r1
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -200,8 +200,8 @@
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #255
 ; THUMB6-NEXT: lsls r1, r1, #4
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -233,8 +233,8 @@
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #1
 ; THUMB6-NEXT: lsls r1, r1, #31
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -261,8 +261,8 @@
 ; THUMB6: @ %bb.0:
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -301,8 +301,8 @@
 ; THUMB6: @ %bb.0:
 ; THUMB6-NEXT: lsls r0, r1
 ; THUMB6-NEXT: ldr r1, .LCPI8_0
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ; THUMB6-NEXT: .p2align 2
@@ -406,8 +406,8 @@
 ; THUMB6-NEXT: push {r7, lr}
 ; THUMB6-NEXT: bl __ashldi3
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: pop {r7, pc}
 ;
@@ -642,14 +642,14 @@
 ; THUMB6-NEXT: ldr r4, [sp, #16]
 ; THUMB6-NEXT: lsls r2, r4
 ; THUMB6-NEXT: ldr r4, .LCPI13_0
-; THUMB6-NEXT: ands r4, r2
-; THUMB6-NEXT: rsbs r2, r4, #0
+; THUMB6-NEXT: ands r2, r4
+; THUMB6-NEXT: rsbs r4, r2, #0
 ; THUMB6-NEXT: adcs r2, r4
 ; THUMB6-NEXT: ldr r4, [sp, #20]
 ; THUMB6-NEXT: lsls r3, r4
 ; THUMB6-NEXT: lsls r4, r0, #31
-; THUMB6-NEXT: ands r4, r3
-; THUMB6-NEXT: rsbs r3, r4, #0
+; THUMB6-NEXT: ands r3, r4
+; THUMB6-NEXT: rsbs r4, r3, #0
 ; THUMB6-NEXT: adcs r3, r4
 ; THUMB6-NEXT: pop {r4, pc}
 ; THUMB6-NEXT: .p2align 2
Index: llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
===================================================================
--- llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -303,8 +303,8 @@
 ; THUMB6-NEXT: lsrs r0, r1
 ; THUMB6-NEXT: movs r1, #1
 ; THUMB6-NEXT: lsls r1, r1, #31
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -331,8 +331,8 @@
 ; THUMB6: @ %bb.0:
 ; THUMB6-NEXT: lsrs r0, r1
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ;
@@ -371,8 +371,8 @@
 ; THUMB6: @ %bb.0:
 ; THUMB6-NEXT: lsrs r0, r1
 ; THUMB6-NEXT: ldr r1, .LCPI8_0
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: bx lr
 ; THUMB6-NEXT: .p2align 2
@@ -459,8 +459,8 @@
 ; THUMB6-NEXT: push {r7, lr}
 ; THUMB6-NEXT: bl __lshrdi3
 ; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
 ; THUMB6-NEXT: adcs r0, r1
 ; THUMB6-NEXT: pop {r7, pc}
 ;
@@ -713,14 +713,14 @@
 ; THUMB6-NEXT: ldr r4, [sp, #16]
 ; THUMB6-NEXT: lsrs r2, r4
 ; THUMB6-NEXT: ldr r4, .LCPI13_0
-; THUMB6-NEXT: ands r4, r2
-; THUMB6-NEXT: rsbs r2, r4, #0
+; THUMB6-NEXT: ands r2, r4
+; THUMB6-NEXT: rsbs r4, r2, #0
 ; THUMB6-NEXT: adcs r2, r4
 ; THUMB6-NEXT: ldr r4, [sp, #20]
 ; THUMB6-NEXT: lsrs r3, r4
 ; THUMB6-NEXT: lsls r4, r0, #31
-; THUMB6-NEXT: ands r4, r3
-; THUMB6-NEXT: rsbs r3, r4, #0
+; THUMB6-NEXT: ands r3, r4
+; THUMB6-NEXT: rsbs r4, r3, #0
 ; THUMB6-NEXT: adcs r3, r4
 ; THUMB6-NEXT: pop {r4, pc}
 ; THUMB6-NEXT: .p2align 2
Index: llvm/test/CodeGen/ARM/ssat.ll
===================================================================
--- llvm/test/CodeGen/ARM/ssat.ll
+++ llvm/test/CodeGen/ARM/ssat.ll
@@ -21,11 +21,11 @@
 ; V4T: @ %bb.0: @ %entry
 ; V4T-NEXT: ldr r1, .LCPI0_0
 ; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -54,12 +54,12 @@
 ; V4T-NEXT: orr r2, r2, #1792
 ; V4T-NEXT: asr r1, r1, #16
 ; V4T-NEXT: cmp r1, r2
-; V4T-NEXT: movlt r2, r0
-; V4T-NEXT: lsl r0, r2, #16
-; V4T-NEXT: asr r1, r0, #16
-; V4T-NEXT: ldr r0, .LCPI1_0
+; V4T-NEXT: movge r0, r2
+; V4T-NEXT: ldr r2, .LCPI1_0
+; V4T-NEXT: lsl r1, r0, #16
+; V4T-NEXT: asr r1, r1, #16
 ; V4T-NEXT: cmn r1, #2048
-; V4T-NEXT: movgt r0, r2
+; V4T-NEXT: movle r0, r2
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -71,12 +71,12 @@
 ; V6T2-NEXT: sxth r1, r0
 ; V6T2-NEXT: movw r2, #2047
 ; V6T2-NEXT: cmp r1, r2
-; V6T2-NEXT: movlt r2, r0
-; V6T2-NEXT: movw r0, #63488
-; V6T2-NEXT: sxth r1, r2
-; V6T2-NEXT: movt r0, #65535
+; V6T2-NEXT: movge r0, r2
+; V6T2-NEXT: movw r2, #63488
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: movt r2, #65535
 ; V6T2-NEXT: cmn r1, #2048
-; V6T2-NEXT: movgt r0, r2
+; V6T2-NEXT: movle r0, r2
 ; V6T2-NEXT: bx lr
 entry:
 %0 = icmp slt i16 %x, 2047
@@ -130,11 +130,11 @@
 ; V4T: @ %bb.0: @ %entry
 ; V4T-NEXT: ldr r1, .LCPI3_0
 ; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -159,11 +159,11 @@
 ; V4T: @ %bb.0: @ %entry
 ; V4T-NEXT: ldr r1, .LCPI4_0
 ; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -189,11 +189,10 @@
 ; V4T-NEXT: mov r1, #1065353216
 ; V4T-NEXT: cmn r0, #8388608
 ; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI5_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI5_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -219,11 +218,10 @@
 ; V4T-NEXT: mov r1, #1065353216
 ; V4T-NEXT: cmn r0, #8388608
 ; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI6_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI6_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -249,11 +247,10 @@
 ; V4T-NEXT: mov r1, #1065353216
 ; V4T-NEXT: cmn r0, #8388608
 ; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI7_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI7_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -284,11 +281,10 @@
 ; V4T-NEXT: mov r1, #1065353216
 ; V4T-NEXT: cmn r0, #8388608
 ; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI8_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI8_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -435,11 +431,10 @@
 ; V4T: @ %bb.0: @ %entry
 ; V4T-NEXT: ldr r1, .LCPI12_0
 ; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI12_1
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI12_1
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
 ; V4T-NEXT: bx lr
 ; V4T-NEXT: .p2align 2
 ; V4T-NEXT: @ %bb.1:
@@ -453,12 +448,11 @@
 ; V6T2-NEXT: movw r1, #47768
 ; V6T2-NEXT: movt r1, #65244
 ; V6T2-NEXT: cmp r0, r1
-; V6T2-NEXT: movgt r1, r0
-; V6T2-NEXT: movw r0, #65535
-; V6T2-NEXT: movt r0, #127
-; V6T2-NEXT: cmp r1, r0
-; V6T2-NEXT: movge r1, r0
-; V6T2-NEXT: mov r0, r1
+; V6T2-NEXT: movle r0, r1
+; V6T2-NEXT: movw r1, #65535
+; V6T2-NEXT: movt r1, #127
+; V6T2-NEXT: cmp r0, r1
+; V6T2-NEXT: movge r0, r1
 ; V6T2-NEXT: bx lr
 entry:
 %0 = icmp sgt i32 %x, -19088744
Index: llvm/test/CodeGen/ARM/usat.ll
===================================================================
--- llvm/test/CodeGen/ARM/usat.ll
+++ llvm/test/CodeGen/ARM/usat.ll
@@ -52,42 +52,39 @@
 define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
 ; V4T-LABEL: unsigned_sat_base_16bit:
 ; V4T: @ %bb.0: @ %entry
+; V4T-NEXT: mov r2, #255
 ; V4T-NEXT: lsl r1, r0, #16
-; V4T-NEXT: asr r2, r1, #16
-; V4T-NEXT: mov r1, #255
-; V4T-NEXT: orr r1, r1, #1792
-; V4T-NEXT: cmp r2, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: lsl r0, r1, #16
-; V4T-NEXT: asr r0, r0, #16
-; V4T-NEXT: cmp r0, #0
-; V4T-NEXT: movle r1, #0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: orr r2, r2, #1792
+; V4T-NEXT: asr r1, r1, #16
+; V4T-NEXT: cmp r1, r2
+; V4T-NEXT: movge r0, r2
+; V4T-NEXT: lsl r1, r0, #16
+; V4T-NEXT: asr r1, r1, #16
+; V4T-NEXT: cmp r1, #0
+; V4T-NEXT: movle r0, #0
 ; V4T-NEXT: bx lr
 ;
 ; V6-LABEL: unsigned_sat_base_16bit:
 ; V6: @ %bb.0: @ %entry
-; V6-NEXT: mov r1, #255
-; V6-NEXT: sxth r2, r0
-; V6-NEXT: orr r1, r1, #1792
-; V6-NEXT: cmp r2, r1
-; V6-NEXT: movlt r1, r0
-; V6-NEXT: sxth r0, r1
-; V6-NEXT: cmp r0, #0
-; V6-NEXT: movle r1, #0
-; V6-NEXT: mov r0, r1
+; V6-NEXT: mov r2, #255
+; V6-NEXT: sxth r1, r0
+; V6-NEXT: orr r2, r2, #1792
+; V6-NEXT: cmp r1, r2
+; V6-NEXT: movge r0, r2
+; V6-NEXT: sxth r1, r0
+; V6-NEXT: cmp r1, #0
+; V6-NEXT: movle r0, #0
 ; V6-NEXT: bx lr
 ;
 ; V6T2-LABEL: unsigned_sat_base_16bit:
 ; V6T2: @ %bb.0: @ %entry
-; V6T2-NEXT: sxth r2, r0
-; V6T2-NEXT: movw r1, #2047
-; V6T2-NEXT: cmp r2, r1
-; V6T2-NEXT: movlt r1, r0
-; V6T2-NEXT: sxth r0, r1
-; V6T2-NEXT: cmp r0, #0
-; V6T2-NEXT: movle r1, #0
-; V6T2-NEXT: mov r0, r1
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: movw r2, #2047
+; V6T2-NEXT: cmp r1, r2
+; V6T2-NEXT: movge r0, r2
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: cmp r1, #0
+; V6T2-NEXT: movle r0, #0
 ; V6T2-NEXT: bx lr
 entry:
 %0 = icmp slt i16 %x, 2047
Index: llvm/test/CodeGen/SystemZ/int-div-01.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/int-div-01.ll
+++ llvm/test/CodeGen/SystemZ/int-div-01.ll
@@ -51,7 +51,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgfr %r2, %r4
 ; CHECK-NOT: dsgfr
-; CHECK: or %r2, %r3
+; CHECK: or %r3, %r2
+; CHECK: lr %r2, %r3
 ; CHECK: br %r14
 %div = sdiv i32 %a, %b
 %rem = srem i32 %a, %b
Index: llvm/test/CodeGen/SystemZ/int-div-03.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/int-div-03.ll
+++ llvm/test/CodeGen/SystemZ/int-div-03.ll
@@ -36,7 +36,8 @@
 ; CHECK-LABEL: f3:
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgfr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
 %bext = sext i32 %b to i64
 %div = sdiv i64 %a, %bext
@@ -102,7 +103,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgf %r2, 0(%r4)
 ; CHECK-NOT: {{dsgf|dsgfr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
 %b = load i32, i32 *%src
 %bext = sext i32 %b to i64
Index: llvm/test/CodeGen/SystemZ/int-div-04.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/int-div-04.ll
+++ llvm/test/CodeGen/SystemZ/int-div-04.ll
@@ -34,7 +34,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsgr %r2, %r4
 ; CHECK-NOT: dsgr
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
 %div = sdiv i64 %a, %b
 %rem = srem i64 %a, %b
@@ -74,7 +75,8 @@
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: dsg %r2, 0(%r4)
 ; CHECK-NOT: {{dsg|dsgr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
 %b = load i64, i64 *%src
 %div = sdiv i64 %a, %b
Index: llvm/test/CodeGen/SystemZ/int-mul-08.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/int-mul-08.ll
+++ llvm/test/CodeGen/SystemZ/int-mul-08.ll
@@ -60,7 +60,8 @@
 ; CHECK-LABEL: f4:
 ; CHECK-NOT: {{%r[234]}}
 ; CHECK: mlgr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
 ; CHECK: br %r14
 %ax = zext i64 %a to i128
 %bx = zext i64 %b to i128
Index: llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
===================================================================
--- llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -117,10 +117,10 @@
 ; CHECK-NEXT: mvns r3, r7
 ; CHECK-NEXT: ldr r0, [sp, #32]
 ; CHECK-NEXT: bl __aeabi_ldivmod
-; CHECK-NEXT: ands r3, r5
+; CHECK-NEXT: ands r5, r3
 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT: eors r2, r0
-; CHECK-NEXT: orrs r2, r3
+; CHECK-NEXT: orrs r2, r5
 ; CHECK-NEXT: subs r0, r2, #1
 ; CHECK-NEXT: sbcs r2, r0
 ; CHECK-NEXT: movs r0, r4
Index: llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
===================================================================
--- llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -284,9 +284,8 @@
 define i64 @t5_cse(i64 %val, i64 %shamt, i64*%dst) nounwind {
 ; X64-NOBMI2-LABEL: t5_cse:
 ; X64-NOBMI2: # %bb.0:
-; X64-NOBMI2-NEXT: movq %rsi, %rcx
 ; X64-NOBMI2-NEXT: movq %rdi, %rax
-; X64-NOBMI2-NEXT: addq $32, %rcx
+; X64-NOBMI2-NEXT: leaq 32(%rsi), %rcx
 ; X64-NOBMI2-NEXT: movq %rcx, (%rdx)
 ; X64-NOBMI2-NEXT: negq %rcx
 ; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $rcx
Index: llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
===================================================================
--- llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -61,11 +61,13 @@
 ;
 ; X86_64-LABEL: t0:
 ; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %edi
-; X86_64-NEXT: addl %edx, %eax
-; X86_64-NEXT: cmpb %al, %dil
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
+; X86_64-NEXT: cmpb %al, %cl
+; X86_64-NEXT: cmovgl %ecx, %eax
 ; X86_64-NEXT: # kill: def $al killed $al killed $eax
 ; X86_64-NEXT: retq
 %a1_wide = add i32 %a1_wide_orig, %inc
@@ -134,11 +136,13 @@
 ;
 ; X86_64-LABEL: neg_only_one_truncation:
 ; X86_64: # %bb.0:
-; X86_64-NEXT: addl %edx, %edi
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
 ; X86_64-NEXT: addb %sil, %dl
-; X86_64-NEXT: cmpb %dl, %dil
+; X86_64-NEXT: cmpb %dl, %cl
 ; X86_64-NEXT: movzbl %dl, %eax
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: cmovgl %ecx, %eax
 ; X86_64-NEXT: # kill: def $al killed $al killed $eax
 ; X86_64-NEXT: retq
 %a1_wide = add i32 %a1_wide_orig, %inc
@@ -205,11 +209,13 @@
 ;
 ; X86_64-LABEL: neg_type_mismatch:
 ; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %edi
-; X86_64-NEXT: addl %edx, %eax
-; X86_64-NEXT: cmpb %al, %dil
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
+; X86_64-NEXT: cmpb %al, %cl
+; X86_64-NEXT: cmovgl %ecx, %eax
 ; X86_64-NEXT: # kill: def $al killed $al killed $eax
 ; X86_64-NEXT: retq
 %a1_wide = add i32 %a1_wide_orig, %inc
@@ -271,8 +277,9 @@
 ;
 ; X86_64-LABEL: negative_CopyFromReg:
 ; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
 ; X86_64-NEXT: cmpb %al, %dil
 ; X86_64-NEXT: cmovgl %edi, %eax
 ; X86_64-NEXT: # kill: def $al killed $al killed $eax
Index: llvm/test/CodeGen/X86/atomic-unordered.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic-unordered.ll
+++ llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -734,11 +734,12 @@
 ; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
 ; CHECK-O3-NEXT: movq %rcx, %rax
 ; CHECK-O3-NEXT: imulq %rdx
-; CHECK-O3-NEXT: addq %rcx, %rdx
-; CHECK-O3-NEXT: movq %rdx, %rax
+; CHECK-O3-NEXT: addq %rdx, %rcx
+; CHECK-O3-NEXT: movq %rcx, %rax
 ; CHECK-O3-NEXT: shrq $63, %rax
-; CHECK-O3-NEXT: sarq $3, %rdx
-; CHECK-O3-NEXT: addq %rdx, %rax
+; CHECK-O3-NEXT: sarq $3, %rcx
+; CHECK-O3-NEXT: addq %rax, %rcx
+; CHECK-O3-NEXT: movq %rcx, %rax
 ; CHECK-O3-NEXT: retq
 %v = load atomic i64, i64* %p unordered, align 8
 %ret = sdiv i64 %v, 15
Index: llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -393,10 +393,10 @@
 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
 ret <16 x i16> %res
@@ -419,11 +419,11 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -2248,11 +2248,11 @@
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4]
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-NEXT: retq
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -4029,12 +4029,12 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,1,5,5]
 ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-FAST-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-FAST-NEXT: vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
Index: llvm/test/CodeGen/X86/bitreverse.ll
===================================================================
--- llvm/test/CodeGen/X86/bitreverse.ll
+++ llvm/test/CodeGen/X86/bitreverse.ll
@@ -372,13 +372,13 @@
 ; X64-NEXT: shlb $2, %al
 ; X64-NEXT: shrb $2, %dil
 ; X64-NEXT: andb $51, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $85, %al
-; X64-NEXT: addb %al, %al
-; X64-NEXT: shrb %dil
-; X64-NEXT: andb $85, %dil
 ; X64-NEXT: orb %dil, %al
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andb $85, %cl
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $85, %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: retq
 ;
 ; X86XOP-LABEL: test_bitreverse_i8:
@@ -422,13 +422,13 @@
 ; X64-NEXT: shlb $2, %al
 ; X64-NEXT: shrb $2, %dil
 ; X64-NEXT: andb $51, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $80, %al
-; X64-NEXT: addb %al, %al
-; X64-NEXT: shrb %dil
-; X64-NEXT: andb $80, %dil
 ; X64-NEXT: orb %dil, %al
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andb $80, %cl
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $80, %al
+; X64-NEXT: orb %cl, %al
 ; X64-NEXT: shrb $4, %al
 ; X64-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/bmi2.ll
===================================================================
--- llvm/test/CodeGen/X86/bmi2.ll
+++ llvm/test/CodeGen/X86/bmi2.ll
@@ -305,10 +305,10 @@
 ;
 ; X64-LABEL: mulx32:
 ; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
 ; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: addl %edi, %edi
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: leal (%rsi,%rsi), %eax
 ; X64-NEXT: imulq %rdi, %rax
 ; X64-NEXT: movq %rax, %rcx
 ; X64-NEXT: shrq $32, %rcx
@@ -340,8 +340,8 @@
 ;
 ; X64-LABEL: mulx32_load:
 ; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (%rdi,%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: imulq %rcx, %rax
 ; X64-NEXT: movq %rax, %rcx
Index: llvm/test/CodeGen/X86/bypass-slow-division-32.ll
===================================================================
--- llvm/test/CodeGen/X86/bypass-slow-division-32.ll
+++ llvm/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -175,12 +175,12 @@
 ; CHECK-NEXT: movl %edx, %eax
 ; CHECK-NEXT: shrl $31, %eax
 ; CHECK-NEXT: sarl $3, %edx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shll $5, %eax
 ; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $5, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: subl %edx, %ecx
+; CHECK-NEXT: addl %eax, %ecx
 ; CHECK-NEXT: movl %ecx, %eax
 ; CHECK-NEXT: retl
 %resultdiv = sdiv i32 %a, 33
Index: llvm/test/CodeGen/X86/combine-bitselect.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-bitselect.ll
+++ llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -979,14 +979,13 @@
 ; SSE-LABEL: bitselect_v4i1_loop:
 ; SSE: # %bb.0: # %bb
 ; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,12,12,12]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; XOP-LABEL: bitselect_v4i1_loop:
Index: llvm/test/CodeGen/X86/combine-sdiv.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-sdiv.ll
+++ llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1613,15 +1613,14 @@
 ; SSE2-NEXT: psrlq $61, %xmm3
 ; SSE2-NEXT: psrlq $60, %xmm2
 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm1, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $3, %xmm2
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm2, %xmm1
+; SSE2-NEXT: psubq %xmm2, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1642,15 +1641,14 @@
 ; SSE41-NEXT: psrlq $60, %xmm3
 ; SSE41-NEXT: psrlq $61, %xmm2
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: paddq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: paddq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $4, %xmm2
+; SSE41-NEXT: psrlq $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: psubq %xmm2, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1762,14 +1760,14 @@
 ; SSE2-NEXT: psrlq $61, %xmm5
 ; SSE2-NEXT: psrlq $60, %xmm4
 ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT: paddq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm1, %xmm4
-; SSE2-NEXT: psubq %xmm1, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlq $3, %xmm4
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm4, %xmm1
+; SSE2-NEXT: psubq %xmm4, %xmm1
 ; SSE2-NEXT: movdqa %xmm3, %xmm5
 ; SSE2-NEXT: psrad $31, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
@@ -1777,68 +1775,64 @@
 ; SSE2-NEXT: psrlq $61, %xmm6
 ; SSE2-NEXT: psrlq $60, %xmm5
 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
-; SSE2-NEXT: paddq %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: psrlq $3, %xmm3
-; SSE2-NEXT: psrlq $4, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE2-NEXT: xorpd %xmm1, %xmm5
-; SSE2-NEXT: psubq %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: paddq %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlq $3, %xmm5
+; SSE2-NEXT: psrlq $4, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
+; SSE2-NEXT: xorpd %xmm4, %xmm3
+; SSE2-NEXT: psubq %xmm4, %xmm3
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrlq $62, %xmm1
-; SSE41-NEXT: paddq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: psrlq $62, %xmm4
+; SSE41-NEXT: paddq %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
 ; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrlq $62, %xmm1
-; SSE41-NEXT: paddq %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: psrlq $62, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
 ; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
 ; SSE41-NEXT: psrlq $60, %xmm5
-; SSE41-NEXT: psrlq $61, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrlq $61, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
 ; SSE41-NEXT: paddq %xmm4, %xmm1
 ; SSE41-NEXT: movdqa %xmm1, %xmm4
 ; SSE41-NEXT: psrlq $4, %xmm4
 ; SSE41-NEXT: psrlq $3, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: psubq %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrad $31, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm5, %xmm6
 ; SSE41-NEXT: psrlq $60, %xmm6
-; SSE41-NEXT: psrlq $61, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: paddq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: psrlq $4, %xmm3
-; SSE41-NEXT: psrlq $3, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: psubq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: psrlq $61, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: paddq %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrlq $4, %xmm5
+; SSE41-NEXT: psrlq $3, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: psubq %xmm4, %xmm3
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
@@ -1968,29 +1962,28 @@
 ; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrld $28, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
 ; SSE2-NEXT: psrld $29, %xmm3
 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrld $30, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $30, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrad $4, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
 ; SSE2-NEXT: psrad $3, %xmm3
 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2340,10 +2333,10 @@
 ; SSE41-NEXT: psrld $31, %xmm1
 ; SSE41-NEXT: paddd %xmm0, %xmm1
 ; SSE41-NEXT: psrad $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: psubd %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: non_splat_minus_one_divisor_2:
@@ -2480,30 +2473,28 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
 ; SSE2-NEXT: pmulhw %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: psrlw $15, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: psrlw $15, %xmm0
+; SSE2-NEXT: paddw %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrlw $15, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: psrlw $15, %xmm0
+; SSE41-NEXT: paddw %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
@@ -2918,26 +2909,25 @@
 define <16 x i8> @pr38658(<16 x i8> %x) {
 ; SSE2-LABEL: pr38658:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: pr38658:
@@ -2948,21 +2938,20 @@
 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: psrlw $8, %xmm2
 ; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
 ; SSE41-NEXT: psllw $6, %xmm2
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $7, %xmm0
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: pr38658:
Index: llvm/test/CodeGen/X86/ctpop-combine.ll
===================================================================
--- llvm/test/CodeGen/X86/ctpop-combine.ll
+++ llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -93,14 +93,14 @@
 ; NO-POPCOUNT-NEXT: shrb %al
 ; NO-POPCOUNT-NEXT: andb $21, %al
 ; NO-POPCOUNT-NEXT: subb %al, %dil
-; NO-POPCOUNT-NEXT: movl %edi, %eax
-; NO-POPCOUNT-NEXT: andb $51, %al
+; NO-POPCOUNT-NEXT: movl %edi, %ecx
+; NO-POPCOUNT-NEXT: andb $51, %cl
 ; NO-POPCOUNT-NEXT: shrb $2, %dil
 ; NO-POPCOUNT-NEXT: andb $51, %dil
-; NO-POPCOUNT-NEXT: addb %al, %dil
-; NO-POPCOUNT-NEXT: movl %edi, %eax
+; NO-POPCOUNT-NEXT: addb %dil, %cl
+; NO-POPCOUNT-NEXT: movl %ecx, %eax
 ; NO-POPCOUNT-NEXT: shrb $4, %al
-; NO-POPCOUNT-NEXT: addb %dil, %al
+; NO-POPCOUNT-NEXT: addb %cl, %al
 ; NO-POPCOUNT-NEXT: andb $15, %al
 ; NO-POPCOUNT-NEXT: retq
 %x2 = and i8 %x, 127
Index: llvm/test/CodeGen/X86/divide-by-constant.ll
===================================================================
--- llvm/test/CodeGen/X86/divide-by-constant.ll
+++ llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -442,10 +442,9 @@
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: imulq %rcx
 ; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: shrq $63, %rcx
-; X64-NEXT: sarq $28, %rax
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: shrq $63, %rax
+; X64-NEXT: sarq $28, %rdx
+; X64-NEXT: addq %rdx, %rax
 ; X64-NEXT: imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
 ; X64-NEXT: subl %ecx, %edi
 ; X64-NEXT: movl %edi, %edx
Index: llvm/test/CodeGen/X86/fshr.ll
===================================================================
--- llvm/test/CodeGen/X86/fshr.ll
+++ llvm/test/CodeGen/X86/fshr.ll
@@ -844,19 +844,19 @@
 ; X64-FAST-NEXT: movl %r9d, %ecx
 ; X64-FAST-NEXT: shrdq %cl, %r8, %rdx
 ; X64-FAST-NEXT: shrq %cl, %r8
-; X64-FAST-NEXT: xorl %eax, %eax
+; X64-FAST-NEXT: xorl %r10d, %r10d
 ; X64-FAST-NEXT: testb $64, %r9b
 ; X64-FAST-NEXT: cmovneq %r8, %rdx
-; X64-FAST-NEXT: cmovneq %rax, %r8
+; X64-FAST-NEXT: cmovneq %r10, %r8
 ; X64-FAST-NEXT: shldq $1, %rdi, %rsi
-; X64-FAST-NEXT: addq %rdi, %rdi
+; X64-FAST-NEXT: leaq (%rdi,%rdi), %rax
 ; X64-FAST-NEXT: notb %r9b
 ; X64-FAST-NEXT: movl %r9d, %ecx
-; X64-FAST-NEXT: shldq %cl, %rdi, %rsi
-; X64-FAST-NEXT: shlq %cl, %rdi
+; X64-FAST-NEXT: shldq %cl, %rax, %rsi
+; X64-FAST-NEXT: shlq %cl, %rax
 ; X64-FAST-NEXT: testb $64, %r9b
-; X64-FAST-NEXT: cmovneq %rdi, %rsi
-; X64-FAST-NEXT: cmoveq %rdi, %rax
+; X64-FAST-NEXT: cmovneq %rax, %rsi
+; X64-FAST-NEXT: cmovneq %r10, %rax
 ; X64-FAST-NEXT: orq %rdx, %rax
 ; X64-FAST-NEXT: orq %rsi, %r8
 ; X64-FAST-NEXT: movq %r8, %rdx
@@ -865,39 +865,38 @@
 ; X64-SLOW-LABEL: var_shift_i128:
 ; X64-SLOW: # %bb.0:
 ; X64-SLOW-NEXT: movq %rcx, %r9
-; X64-SLOW-NEXT: movq %rdx, %r10
-; X64-SLOW-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; X64-SLOW-NEXT: andq %rdi, %rax
+; X64-SLOW-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; X64-SLOW-NEXT: andq %rdi, %r10
 ; X64-SLOW-NEXT: movl %r8d, %ecx
-; X64-SLOW-NEXT: shrq %cl, %rax
-; X64-SLOW-NEXT: movq %rdi, %rcx
-; X64-SLOW-NEXT: shrq $63, %rcx
-; X64-SLOW-NEXT: leaq (%rcx,%rsi,2), %rdx
+; X64-SLOW-NEXT: shrq %cl, %r10
+; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax
+; X64-SLOW-NEXT: shrq $63, %rdi
+; X64-SLOW-NEXT: leaq (%rdi,%rsi,2), %rsi
 ; X64-SLOW-NEXT: movl %r8d, %r11d
 ; X64-SLOW-NEXT: notb %r11b
 ; X64-SLOW-NEXT: movl %r11d, %ecx
-; X64-SLOW-NEXT: shlq %cl, %rdx
-; X64-SLOW-NEXT: orq %rax, %rdx
-; X64-SLOW-NEXT: movl %r8d, %ecx
-; X64-SLOW-NEXT: shrq %cl, %r10
-; X64-SLOW-NEXT: leaq (%r9,%r9), %rsi
-; X64-SLOW-NEXT: movl %r11d, %ecx
 ; X64-SLOW-NEXT: shlq %cl, %rsi
 ; X64-SLOW-NEXT: orq %r10, %rsi
 ; X64-SLOW-NEXT: movl %r8d, %ecx
+; X64-SLOW-NEXT: shrq %cl, %rdx
+; X64-SLOW-NEXT: leaq (%r9,%r9), %rdi
+; X64-SLOW-NEXT: movl %r11d, %ecx
+; X64-SLOW-NEXT: shlq %cl, %rdi
+; X64-SLOW-NEXT: orq %rdx, %rdi
+; X64-SLOW-NEXT: movl %r8d, %ecx
 ; X64-SLOW-NEXT: shrq %cl, %r9
-; X64-SLOW-NEXT: xorl %eax, %eax
+; X64-SLOW-NEXT: xorl %edx, %edx
 ; X64-SLOW-NEXT: testb $64, %r8b
-; X64-SLOW-NEXT: cmovneq %r9, %rsi
-; X64-SLOW-NEXT: cmovneq %rax, %r9
-; X64-SLOW-NEXT: addq %rdi, %rdi
+; X64-SLOW-NEXT: cmovneq %r9, %rdi
+; X64-SLOW-NEXT: cmovneq %rdx, %r9
 ; X64-SLOW-NEXT: movl %r11d, %ecx
-; X64-SLOW-NEXT: shlq %cl, %rdi
+; X64-SLOW-NEXT: shlq %cl, %rax
 ; X64-SLOW-NEXT: testb $64, %r11b
-; X64-SLOW-NEXT: cmovneq %rdi, %rdx
-; X64-SLOW-NEXT: cmoveq %rdi, %rax
-; X64-SLOW-NEXT: orq %rsi, %rax
-; X64-SLOW-NEXT: orq %r9, %rdx
+; X64-SLOW-NEXT: cmovneq %rax, %rsi
+; X64-SLOW-NEXT: cmovneq %rdx, %rax
+; X64-SLOW-NEXT: orq %rdi, %rax
+; X64-SLOW-NEXT: orq %r9, %rsi
+; X64-SLOW-NEXT: movq %rsi, %rdx
 ; X64-SLOW-NEXT: retq
 %tmp = tail call i128 @llvm.fshr.i128(i128 %x, i128 %y, i128 %z)
 ret i128 %tmp
Index: llvm/test/CodeGen/X86/haddsub-3.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub-3.ll
+++ llvm/test/CodeGen/X86/haddsub-3.ll
@@ -11,18 +11,18 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-SLOW-LABEL: pr26491:
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: retq
 ;
Index: llvm/test/CodeGen/X86/haddsub-shuf.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -479,8 +479,8 @@
 ; SSE3-LABEL: hadd_v8i32a:
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE3-NEXT: paddd %xmm0, %xmm2
 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
 ; SSE3-NEXT: movdqa %xmm2, %xmm1
Index: llvm/test/CodeGen/X86/haddsub.ll
===================================================================
--- llvm/test/CodeGen/X86/haddsub.ll
+++ llvm/test/CodeGen/X86/haddsub.ll
@@ -1777,8 +1777,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -1786,9 +1786,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: hadd32_4:
@@ -1818,8 +1817,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -1827,9 +1826,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: hadd32_8:
@@ -1861,8 +1859,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -1870,9 +1868,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: hadd32_16:
@@ -1904,9 +1901,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_4_optsize:
@@ -1928,9 +1924,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_8_optsize:
@@ -1953,9 +1948,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_16_optsize:
@@ -1978,9 +1972,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_4_pgso:
@@ -2002,9 +1995,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_8_pgso:
@@ -2027,9 +2019,8 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: movaps %xmm0, %xmm1
 ; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_16_pgso:
@@ -2052,8 +2043,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -2061,9 +2052,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
@@ -2097,8 +2087,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -2106,9 +2096,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
@@ -2140,8 +2129,8 @@
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
 ;
@@ -2149,9 +2138,8 @@
 ; SSE3-FAST: # %bb.0:
 ; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
 ; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
Index: llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
===================================================================
--- llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -13,9 +13,9 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -23,8 +23,8 @@
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: retq
 ;
@@ -127,9 +127,9 @@
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -138,8 +138,8 @@
 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: retq
 ;
@@ -197,9 +197,9 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: addpd %xmm3, %xmm1
 ; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -207,9 +207,9 @@
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm1
 ; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: addpd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movapd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT: retq
 ;
@@ -217,9 +217,8 @@
 ; SSSE3-FAST: # %bb.0:
 ; SSSE3-FAST-NEXT: addpd %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: addpd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddpd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movapd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: addpd %xmm1, %xmm0
SSSE3-FAST-NEXT: haddpd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: PR37890_v8f64: @@ -267,34 +266,34 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: addps %xmm3, %xmm1 ; SSE2-NEXT: addps %xmm2, %xmm1 -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: addps %xmm1, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE2-NEXT: addss %xmm2, %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-SLOW-LABEL: PR37890_v16f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: addss %xmm2, %xmm0 +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: PR37890_v16f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: addps %xmm3, %xmm1 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq Index: llvm/test/CodeGen/X86/horizontal-sum.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-sum.ll +++ llvm/test/CodeGen/X86/horizontal-sum.ll @@ -535,53 +535,53 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) { ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 -; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] -; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1 +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: 
unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 -; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 +; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1] -; SSSE3-FAST-NEXT: addps %xmm4, %xmm5 -; SSSE3-FAST-NEXT: addps %xmm5, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1] +; SSSE3-FAST-NEXT: addps %xmm5, %xmm4 +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm2, %xmm3 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm1, %xmm3 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32: @@ -961,13 +961,13 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 @@ -976,10 +976,10 @@ ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 ; SSSE3-SLOW-NEXT: movlhps 
{{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; @@ -1050,12 +1050,12 @@ ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] @@ -1063,12 +1063,11 @@ ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: Index: llvm/test/CodeGen/X86/lzcnt-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -194,11 +194,11 @@ ; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64: ; X64: # %bb.0: ; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; X64-NEXT: pand %xmm1, %xmm2 -; X64-NEXT: pcmpeqd %xmm0, %xmm0 -; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: retq %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) %lshr = lshr <2 x i64> %ctlz, @@ -232,8 +232,8 @@ ; X64-LABEL: lshr_ctlz_cmpne_zero_v2i64: ; X64: # %bb.0: ; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; X64-NEXT: pcmpeqd %xmm1, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X64-NEXT: pand %xmm1, %xmm0 ; X64-NEXT: retq %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) Index: llvm/test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- llvm/test/CodeGen/X86/nontemporal-loads.ll +++ 
llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1764,10 +1764,10 @@ ; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn (%rdi), %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pandn (%rdi), %xmm8 +; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn 16(%rdi), %xmm5 ; SSE2-NEXT: por %xmm5, %xmm1 Index: llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll =================================================================== --- llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -160,10 +160,10 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE4-NEXT: por %xmm2, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588] -; SSE4-NEXT: pminud %xmm1, %xmm0 +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE4-NEXT: por %xmm2, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,858993459,715827882,477218588] +; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; Index: llvm/test/CodeGen/X86/pmulh.ll =================================================================== --- llvm/test/CodeGen/X86/pmulh.ll +++ llvm/test/CodeGen/X86/pmulh.ll @@ -326,32 +326,32 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: pmulhw %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: pmulhw %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: packssdw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm8 +; SSE2-NEXT: packssdw %xmm7, %xmm8 +; SSE2-NEXT: pmulhw %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: packssdw %xmm5, %xmm6 +; SSE2-NEXT: pmulhw %xmm6, 
%xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: packssdw %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: @@ -825,13 +825,11 @@ define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: zext_mulhuw_v8i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmulhuw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v8i16_lshr: @@ -859,13 +857,11 @@ define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: mulhsw_v8i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmulhw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v8i16_lshr: @@ -925,18 +921,17 @@ define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: zext_mulhuw_v16i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pmulhuw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhuw %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE2-NEXT: pmulhuw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pmulhuw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v16i16_lshr: @@ -976,18 +971,17 @@ define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: mulhsw_v16i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pmulhw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE2-NEXT: pmulhw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pmulhw %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v16i16_lshr: Index: llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- llvm/test/CodeGen/X86/popcnt.ll +++ llvm/test/CodeGen/X86/popcnt.ll @@ -31,14 +31,14 @@ ; X64-NEXT: shrb %al ; X64-NEXT: andb $85, %al ; X64-NEXT: subb %al, %dil -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andb $51, %al +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: andb $51, %cl ; X64-NEXT: shrb $2, %dil ; X64-NEXT: andb $51, %dil -; X64-NEXT: addb %al, %dil -; X64-NEXT: movl %edi, %eax +; X64-NEXT: addb %dil, %cl +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: shrb $4, %al -; X64-NEXT: addb %dil, %al +; X64-NEXT: addb %cl, %al ; X64-NEXT: andb $15, %al ; X64-NEXT: retq ; @@ -225,15 +225,15 @@ ; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rcx, %rdx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: imulq %rdx, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; @@ -385,36 +385,36 @@ ; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 ; X64-NEXT: andq %r8, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: shrq 
$4, %rcx -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: addq %rax, %rdx ; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rcx -; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rcx -; X64-NEXT: shrq $56, %rcx -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: andq %r8, %rsi -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: andq %r9, %rdx +; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 +; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: shrq $56, %rdx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: andq %r8, %rax +; X64-NEXT: subq %rax, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: addq %rcx, %rax ; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: imulq %rsi, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; @@ -579,15 +579,15 @@ ; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rcx, %rdx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: imulq %rdx, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; @@ -721,15 +721,15 @@ ; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rcx, %rdx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: imulq %rdx, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; @@ -890,36 +890,36 @@ ; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 ; X64-NEXT: andq %r8, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: andq 
%rcx, %rsi +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: addq %rax, %rdx ; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rcx -; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rcx -; X64-NEXT: shrq $56, %rcx -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: andq %r8, %rsi -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: andq %r9, %rdx +; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 +; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: shrq $56, %rdx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: andq %r8, %rax +; X64-NEXT: subq %rax, %rdi ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: addq %rcx, %rax ; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: imulq %rsi, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; @@ -1151,15 +1151,15 @@ ; X64-NEXT: movq %rdi, %rcx ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rcx, %rdx ; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 -; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: imulq %rdx, %rax ; X64-NEXT: shrq $56, %rax ; X64-NEXT: retq ; @@ -1320,36 +1320,36 @@ ; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 ; X64-NEXT: andq %r8, %rax ; X64-NEXT: subq %rax, %rsi -; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: addq %rax, %rdx ; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %r9, %rcx -; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101 -; X64-NEXT: imulq %rdx, %rcx -; X64-NEXT: shrq $56, %rcx -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: shrq %rsi -; X64-NEXT: andq %r8, %rsi -; X64-NEXT: subq %rsi, %rdi -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: andq %rax, %rsi -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: andq %r9, %rdx +; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 +; X64-NEXT: imulq %rsi, %rdx +; X64-NEXT: shrq $56, %rdx ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq %rax +; 
X64-NEXT: andq %r8, %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: shrq $4, %rax -; X64-NEXT: addq %rdi, %rax +; X64-NEXT: addq %rcx, %rax ; X64-NEXT: andq %r9, %rax -; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: imulq %rsi, %rax ; X64-NEXT: shrq $56, %rax -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq ; Index: llvm/test/CodeGen/X86/pull-binop-through-shift.ll =================================================================== --- llvm/test/CodeGen/X86/pull-binop-through-shift.ll +++ llvm/test/CodeGen/X86/pull-binop-through-shift.ll @@ -140,9 +140,9 @@ define i32 @add_signbit_shl(i32 %x, i32* %dst) { ; X64-LABEL: add_signbit_shl: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $8, %eax -; X64-NEXT: addl $-16777216, %eax # imm = 0xFF000000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll $8, %edi +; X64-NEXT: leal -16777216(%rdi), %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq ; @@ -162,9 +162,9 @@ define i32 @add_nosignbit_shl(i32 %x, i32* %dst) { ; X64-LABEL: add_nosignbit_shl: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $8, %eax -; X64-NEXT: addl $-16777216, %eax # imm = 0xFF000000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll $8, %edi +; X64-NEXT: leal -16777216(%rdi), %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq ; @@ -322,8 +322,8 @@ define i32 @add_signbit_lshr(i32 %x, i32* %dst) { ; X64-LABEL: add_signbit_lshr: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal -65536(%rdi), %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq @@ -344,8 +344,8 @@ define i32 @add_nosignbit_lshr(i32 %x, i32* %dst) { ; X64-LABEL: add_nosignbit_lshr: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 2147418112(%rdi), %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq @@ -503,8 +503,8 @@ define i32 @add_signbit_ashr(i32 %x, i32* %dst) { ; X64-LABEL: add_signbit_ashr: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: addl $-65536, %eax # imm = 0xFFFF0000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal -65536(%rdi), %eax ; X64-NEXT: sarl $8, %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq @@ -525,8 +525,8 @@ define i32 @add_nosignbit_ashr(i32 %x, i32* %dst) { ; X64-LABEL: add_nosignbit_ashr: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 2147418112(%rdi), %eax ; X64-NEXT: sarl $8, %eax ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/rem.ll =================================================================== --- llvm/test/CodeGen/X86/rem.ll +++ llvm/test/CodeGen/X86/rem.ll @@ -8,16 +8,15 @@ ; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081 ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: imull %edx -; CHECK-NEXT: addl %ecx, %edx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $7, %edx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: subl %eax, %edx -; 
CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: leal (%edx,%ecx), %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $7, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: retl %tmp1 = srem i32 %X, 255 ret i32 %tmp1 Index: llvm/test/CodeGen/X86/sat-add.ll =================================================================== --- llvm/test/CodeGen/X86/sat-add.ll +++ llvm/test/CodeGen/X86/sat-add.ll @@ -1265,11 +1265,12 @@ ; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm2 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm2, %xmm3 +; SSE42-NEXT: paddq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm2 +; SSE42-NEXT: pcmpgtq %xmm2, %xmm3 +; SSE42-NEXT: por %xmm3, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: Index: llvm/test/CodeGen/X86/sdiv_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -29,15 +29,15 @@ ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl -; X64-NEXT: cmovel %eax, %edi -; X64-NEXT: cmpl $65535, %edi # imm = 0xFFFF +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X64-NEXT: cmovll %edi, %ecx -; X64-NEXT: cmpl $-65535, %ecx # imm = 0xFFFF0001 -; X64-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X64-NEXT: cmovgel %ecx, %eax +; X64-NEXT: cmpl $-65535, %eax # imm = 0xFFFF0001 +; X64-NEXT: movl $-65536, %ecx # imm = 0xFFFF0000 +; X64-NEXT: cmovll %ecx, %eax ; X64-NEXT: shrl %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq ; ; X86-LABEL: func: @@ -45,14 +45,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi -; X86-NEXT: leal -1(%eax), %esi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -60,13 +60,13 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmpl $-65535, %ecx # imm = 0xFFFF0001 -; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: cmpl $-65535, %eax # imm = 0xFFFF0001 +; X86-NEXT: movl $-65536, %ecx # imm = 0xFFFF0000 +; X86-NEXT: cmovll %ecx, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -99,14 +99,14 @@ ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl -; X64-NEXT: cmovel %eax, %edi -; X64-NEXT: cmpl $16383, %edi # imm = 
0x3FFF +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF ; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF -; X64-NEXT: cmovll %edi, %ecx -; X64-NEXT: cmpl $-16383, %ecx # imm = 0xC001 -; X64-NEXT: movl $-16384, %eax # imm = 0xC000 ; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: cmpl $-16383, %eax # imm = 0xC001 +; X64-NEXT: movl $-16384, %ecx # imm = 0xC000 +; X64-NEXT: cmovll %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq ; ; X86-LABEL: func2: @@ -114,14 +114,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $14, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi -; X86-NEXT: leal -1(%eax), %esi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -129,13 +129,13 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: cmpl $16383, %esi # imm = 0x3FFF +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF ; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF -; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmpl $-16383, %ecx # imm = 0xC001 -; X86-NEXT: movl $-16384, %eax # imm = 0xC000 ; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: cmpl $-16383, %eax # imm = 0xC001 +; X86-NEXT: movl $-16384, %ecx # imm = 0xC000 +; X86-NEXT: cmovll %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -169,16 +169,16 @@ ; X64-NEXT: testw %dx, %dx ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl -; X64-NEXT: cmovel %eax, %esi -; X64-NEXT: movswl %si, %eax -; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: movswl %ax, %ecx +; X64-NEXT: cmpl $16383, %ecx # imm = 0x3FFF ; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF -; X64-NEXT: cmovll %esi, %ecx -; X64-NEXT: movswl %cx, %eax -; X64-NEXT: cmpl $-16383, %eax # imm = 0xC001 -; X64-NEXT: movl $49152, %eax # imm = 0xC000 ; X64-NEXT: cmovgel %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movswl %ax, %ecx +; X64-NEXT: cmpl $-16383, %ecx # imm = 0xC001 +; X64-NEXT: movl $49152, %ecx # imm = 0xC000 +; X64-NEXT: cmovll %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq ; ; X86-LABEL: func3: @@ -188,31 +188,31 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %edi +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %edi +; X86-NEXT: shrl $4, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %di +; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: leal -1(%eax), %edi ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %ch ; X86-NEXT: xorb %cl, %ch ; X86-NEXT: testw %dx, %dx ; X86-NEXT: setne %cl ; X86-NEXT: testb %ch, %cl -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: movswl %si, %eax -; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: cmpl $16383, %ecx # imm = 0x3FFF ; X86-NEXT: movl $16383, %ecx # 
imm = 0x3FFF -; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: cmpl $-16383, %eax # imm = 0xC001 -; X86-NEXT: movl $49152, %eax # imm = 0xC000 ; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: cmpl $-16383, %ecx # imm = 0xC001 +; X86-NEXT: movl $49152, %ecx # imm = 0xC000 +; X86-NEXT: cmovll %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -521,13 +521,14 @@ ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setne %dl ; X64-NEXT: testb %cl, %dl -; X64-NEXT: cmovel %eax, %edi -; X64-NEXT: cmpl $131071, %edi # imm = 0x1FFFF +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF ; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF -; X64-NEXT: cmovll %edi, %ecx -; X64-NEXT: cmpl $-131071, %ecx # imm = 0xFFFE0001 -; X64-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 ; X64-NEXT: cmovgel %ecx, %eax +; X64-NEXT: cmpl $-131071, %eax # imm = 0xFFFE0001 +; X64-NEXT: movl $-131072, %ecx # imm = 0xFFFE0000 +; X64-NEXT: cmovll %ecx, %eax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq ; ; X86-LABEL: func6: @@ -535,14 +536,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi -; X86-NEXT: leal -1(%eax), %esi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: idivl %esi +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -550,13 +551,13 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: cmpl $131071, %esi # imm = 0x1FFFF +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF ; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF -; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmpl $-131071, %ecx # imm = 0xFFFE0001 -; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 ; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: cmpl $-131071, %eax # imm = 0xFFFE0001 +; X86-NEXT: movl $-131072, %ecx # imm = 0xFFFE0000 +; X86-NEXT: cmovll %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/setcc-combine.ll =================================================================== --- llvm/test/CodeGen/X86/setcc-combine.ll +++ llvm/test/CodeGen/X86/setcc-combine.ll @@ -326,11 +326,10 @@ ; SSE2-LABEL: sub_to_shift_to_add_vec: ; SSE2: # %bb.0: ; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: sub_to_shift_to_add_vec: Index: llvm/test/CodeGen/X86/shift-combine.ll =================================================================== --- llvm/test/CodeGen/X86/shift-combine.ll +++ llvm/test/CodeGen/X86/shift-combine.ll @@ -317,9 +317,9 @@ ; ; X64-LABEL: ashr_add_shl_i32_i8_extra_use1: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $24, %eax -; X64-NEXT: addl $33554432, %eax # imm = 0x2000000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll $24, %edi +; X64-NEXT: leal 33554432(%rdi), %eax ; X64-NEXT: movl %eax, 
(%rsi) ; X64-NEXT: sarl $24, %eax ; X64-NEXT: retq @@ -371,10 +371,10 @@ ; ; X64-LABEL: ashr_add_shl_i32_i8_extra_use3: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $24, %eax -; X64-NEXT: movl %eax, (%rsi) -; X64-NEXT: addl $33554432, %eax # imm = 0x2000000 +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: shll $24, %edi +; X64-NEXT: movl %edi, (%rsi) +; X64-NEXT: leal 33554432(%rdi), %eax ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: sarl $24, %eax ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/shl-crash-on-legalize.ll =================================================================== --- llvm/test/CodeGen/X86/shl-crash-on-legalize.ll +++ llvm/test/CodeGen/X86/shl-crash-on-legalize.ll @@ -11,14 +11,13 @@ define i32 @PR29058(i8 %x, i32 %y) { ; CHECK-LABEL: PR29058: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: testb %dil, %dil ; CHECK-NEXT: movl $2147483646, %eax # imm = 0x7FFFFFFE ; CHECK-NEXT: cmovnel %esi, %eax -; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: cmpb $1, %dil -; CHECK-NEXT: sbbl %edx, %edx -; CHECK-NEXT: orb %dl, %cl +; CHECK-NEXT: sbbl %ecx, %ecx +; CHECK-NEXT: orb %sil, %cl ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: movq %rax, structMember(%rip) Index: llvm/test/CodeGen/X86/slow-pmulld.ll =================================================================== --- llvm/test/CodeGen/X86/slow-pmulld.ll +++ llvm/test/CodeGen/X86/slow-pmulld.ll @@ -248,17 +248,16 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLM32-LABEL: test_mul_v16i32_v16i8: ; SLM32: # %bb.0: -; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM32-NEXT: movdqa %xmm0, %xmm3 -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLM32-NEXT: pxor %xmm4, %xmm4 +; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SLM32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM32-NEXT: movdqa %xmm3, %xmm4 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhw %xmm0, %xmm2 +; SLM32-NEXT: movdqa %xmm0, %xmm4 +; SLM32-NEXT: pmullw %xmm3, %xmm1 +; SLM32-NEXT: pmulhw %xmm3, %xmm2 +; SLM32-NEXT: pmulhw %xmm3, %xmm4 ; SLM32-NEXT: pmullw %xmm0, %xmm3 -; SLM32-NEXT: pmulhw %xmm0, %xmm4 ; SLM32-NEXT: movdqa %xmm1, %xmm0 ; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -269,17 +268,16 @@ ; ; SLM64-LABEL: test_mul_v16i32_v16i8: ; SLM64: # %bb.0: -; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLM64-NEXT: movdqa %xmm0, %xmm3 -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM64-NEXT: movdqa {{.*#+}} xmm3 = 
[18778,18778,18778,18778,18778,18778,18778,18778] ; SLM64-NEXT: pxor %xmm4, %xmm4 +; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SLM64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLM64-NEXT: movdqa %xmm3, %xmm4 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhw %xmm0, %xmm2 +; SLM64-NEXT: movdqa %xmm0, %xmm4 +; SLM64-NEXT: pmullw %xmm3, %xmm1 +; SLM64-NEXT: pmulhw %xmm3, %xmm2 +; SLM64-NEXT: pmulhw %xmm3, %xmm4 ; SLM64-NEXT: pmullw %xmm0, %xmm3 -; SLM64-NEXT: pmulhw %xmm0, %xmm4 ; SLM64-NEXT: movdqa %xmm1, %xmm0 ; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -290,44 +288,44 @@ ; ; SLOW32-LABEL: test_mul_v16i32_v16i8: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm0, %xmm3 ; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW32-NEXT: movdqa %xmm1, %xmm2 +; SLOW32-NEXT: pmulhw %xmm3, %xmm2 +; SLOW32-NEXT: pmullw %xmm3, %xmm1 ; SLOW32-NEXT: movdqa %xmm1, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW32-NEXT: pxor %xmm4, %xmm4 -; SLOW32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW32-NEXT: movdqa %xmm3, %xmm4 -; SLOW32-NEXT: pmulhw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm3 +; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW32-NEXT: pxor %xmm2, %xmm2 +; SLOW32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SLOW32-NEXT: movdqa %xmm0, %xmm5 +; SLOW32-NEXT: pmulhw %xmm3, %xmm5 +; SLOW32-NEXT: pmullw %xmm0, %xmm3 ; SLOW32-NEXT: movdqa %xmm3, %xmm2 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLOW32-NEXT: movdqa %xmm4, %xmm0 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v16i32_v16i8: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm0, %xmm3 ; 
SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW64-NEXT: movdqa %xmm1, %xmm2 +; SLOW64-NEXT: pmulhw %xmm3, %xmm2 +; SLOW64-NEXT: pmullw %xmm3, %xmm1 ; SLOW64-NEXT: movdqa %xmm1, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW64-NEXT: pxor %xmm4, %xmm4 -; SLOW64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SLOW64-NEXT: movdqa %xmm3, %xmm4 -; SLOW64-NEXT: pmulhw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm3 +; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLOW64-NEXT: pxor %xmm2, %xmm2 +; SLOW64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SLOW64-NEXT: movdqa %xmm0, %xmm5 +; SLOW64-NEXT: pmulhw %xmm3, %xmm5 +; SLOW64-NEXT: pmullw %xmm0, %xmm3 ; SLOW64-NEXT: movdqa %xmm3, %xmm2 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLOW64-NEXT: movdqa %xmm4, %xmm0 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v16i32_v16i8: @@ -498,11 +496,10 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; SLM32-LABEL: test_mul_v8i32_v8i16: ; SLM32: # %bb.0: -; SLM32-NEXT: movdqa %xmm0, %xmm1 -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: movdqa %xmm1, %xmm2 +; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM32-NEXT: movdqa %xmm0, %xmm2 +; SLM32-NEXT: pmulhuw %xmm1, %xmm2 ; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhuw %xmm0, %xmm2 ; SLM32-NEXT: movdqa %xmm1, %xmm0 ; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -510,11 +507,10 @@ ; ; SLM64-LABEL: test_mul_v8i32_v8i16: ; SLM64: # %bb.0: -; SLM64-NEXT: movdqa %xmm0, %xmm1 -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: movdqa %xmm1, %xmm2 +; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM64-NEXT: movdqa %xmm0, %xmm2 +; SLM64-NEXT: pmulhuw %xmm1, %xmm2 ; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhuw %xmm0, %xmm2 ; SLM64-NEXT: movdqa %xmm1, %xmm0 ; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -522,10 +518,9 @@ ; ; SLOW32-LABEL: test_mul_v8i32_v8i16: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm0, %xmm1 -; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW32-NEXT: movdqa %xmm1, %xmm2 -; SLOW32-NEXT: pmulhuw %xmm0, %xmm2 +; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW32-NEXT: movdqa %xmm0, %xmm2 +; SLOW32-NEXT: pmulhuw %xmm1, %xmm2 ; SLOW32-NEXT: pmullw %xmm0, %xmm1 ; SLOW32-NEXT: movdqa %xmm1, %xmm0 ; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -534,10 +529,9 @@ ; ; SLOW64-LABEL: test_mul_v8i32_v8i16: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm0, %xmm1 -; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLOW64-NEXT: movdqa %xmm1, %xmm2 -; SLOW64-NEXT: pmulhuw %xmm0, %xmm2 +; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW64-NEXT: movdqa %xmm0, %xmm2 +; SLOW64-NEXT: pmulhuw %xmm1, %xmm2 ; SLOW64-NEXT: pmullw %xmm0, %xmm1 ; SLOW64-NEXT: movdqa %xmm1, %xmm0 ; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -597,78 +591,78 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; SLM32-LABEL: test_mul_v16i32_v16i16: ; SLM32: # %bb.0: -; SLM32-NEXT: movdqa %xmm1, %xmm3 -; SLM32-NEXT: movdqa %xmm0, %xmm1 -; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM32-NEXT: movdqa %xmm1, %xmm2 -; SLM32-NEXT: movdqa %xmm3, %xmm4 -; SLM32-NEXT: pmullw %xmm0, %xmm1 -; SLM32-NEXT: pmulhuw %xmm0, %xmm2 -; SLM32-NEXT: pmullw %xmm0, %xmm3 -; SLM32-NEXT: pmulhuw %xmm0, %xmm4 -; SLM32-NEXT: movdqa %xmm1, %xmm0 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLM32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM32-NEXT: movdqa %xmm0, %xmm4 +; SLM32-NEXT: movdqa %xmm0, %xmm2 +; SLM32-NEXT: movdqa %xmm1, %xmm5 +; SLM32-NEXT: pmullw %xmm3, %xmm4 +; SLM32-NEXT: pmulhuw %xmm3, %xmm2 +; SLM32-NEXT: pmulhuw %xmm3, %xmm5 +; SLM32-NEXT: pmullw %xmm1, %xmm3 +; SLM32-NEXT: movdqa %xmm4, %xmm0 +; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM32-NEXT: movdqa %xmm3, %xmm2 -; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLM32-NEXT: movdqa %xmm4, %xmm1 +; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i16: ; SLM64: # %bb.0: -; SLM64-NEXT: movdqa %xmm1, %xmm3 -; SLM64-NEXT: movdqa %xmm0, %xmm1 -; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778] -; SLM64-NEXT: movdqa %xmm1, %xmm2 -; SLM64-NEXT: movdqa %xmm3, %xmm4 -; SLM64-NEXT: pmullw %xmm0, %xmm1 -; SLM64-NEXT: pmulhuw %xmm0, %xmm2 -; SLM64-NEXT: pmullw %xmm0, %xmm3 -; SLM64-NEXT: pmulhuw 
%xmm0, %xmm4 -; SLM64-NEXT: movdqa %xmm1, %xmm0 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SLM64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLM64-NEXT: movdqa %xmm0, %xmm4 +; SLM64-NEXT: movdqa %xmm0, %xmm2 +; SLM64-NEXT: movdqa %xmm1, %xmm5 +; SLM64-NEXT: pmullw %xmm3, %xmm4 +; SLM64-NEXT: pmulhuw %xmm3, %xmm2 +; SLM64-NEXT: pmulhuw %xmm3, %xmm5 +; SLM64-NEXT: pmullw %xmm1, %xmm3 +; SLM64-NEXT: movdqa %xmm4, %xmm0 +; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SLM64-NEXT: movdqa %xmm3, %xmm2 -; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLM64-NEXT: movdqa %xmm4, %xmm1 +; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i16: ; SLOW32: # %bb.0: -; SLOW32-NEXT: movdqa %xmm1, %xmm3 -; SLOW32-NEXT: movdqa %xmm0, %xmm1 -; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLOW32-NEXT: movdqa %xmm0, %xmm4 -; SLOW32-NEXT: pmulhuw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm1 -; SLOW32-NEXT: movdqa %xmm1, %xmm0 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW32-NEXT: movdqa %xmm3, %xmm4 -; SLOW32-NEXT: pmulhuw %xmm2, %xmm4 -; SLOW32-NEXT: pmullw %xmm2, %xmm3 +; SLOW32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW32-NEXT: movdqa %xmm0, %xmm2 +; SLOW32-NEXT: pmulhuw %xmm3, %xmm2 +; SLOW32-NEXT: pmullw %xmm3, %xmm4 +; SLOW32-NEXT: movdqa %xmm4, %xmm0 +; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLOW32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SLOW32-NEXT: movdqa %xmm1, %xmm5 +; SLOW32-NEXT: pmulhuw %xmm3, %xmm5 +; SLOW32-NEXT: pmullw %xmm1, %xmm3 ; SLOW32-NEXT: movdqa %xmm3, %xmm2 -; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLOW32-NEXT: movdqa %xmm4, %xmm1 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v16i32_v16i16: ; SLOW64: # %bb.0: -; SLOW64-NEXT: movdqa %xmm1, %xmm3 -; SLOW64-NEXT: movdqa %xmm0, %xmm1 -; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778] ; SLOW64-NEXT: movdqa %xmm0, %xmm4 -; SLOW64-NEXT: pmulhuw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm1 -; SLOW64-NEXT: movdqa %xmm1, %xmm0 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SLOW64-NEXT: movdqa %xmm3, %xmm4 -; SLOW64-NEXT: pmulhuw %xmm2, %xmm4 -; SLOW64-NEXT: pmullw %xmm2, %xmm3 +; SLOW64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778] +; SLOW64-NEXT: movdqa %xmm0, %xmm2 +; SLOW64-NEXT: pmulhuw %xmm3, %xmm2 +; SLOW64-NEXT: pmullw %xmm3, %xmm4 +; SLOW64-NEXT: movdqa %xmm4, %xmm0 +; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SLOW64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SLOW64-NEXT: movdqa %xmm1, %xmm5 +; SLOW64-NEXT: pmulhuw %xmm3, %xmm5 +; SLOW64-NEXT: pmullw %xmm1, %xmm3 ; SLOW64-NEXT: movdqa %xmm3, %xmm2 -; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SLOW64-NEXT: movdqa %xmm4, %xmm1 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v16i32_v16i16: Index: llvm/test/CodeGen/X86/smul_fix.ll =================================================================== --- llvm/test/CodeGen/X86/smul_fix.ll +++ llvm/test/CodeGen/X86/smul_fix.ll @@ -56,12 +56,11 @@ ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebx, %eax @@ -69,16 +68,17 @@ ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: subl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl %ecx, %edi ; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnsl %edi, %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %edx +; X86-NEXT: cmovsl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: shldl $30, %esi, %eax ; X86-NEXT: popl %esi @@ -333,16 +333,17 @@ ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movl %ebp, %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl %esi, %edi ; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnsl %edi, %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %edx +; X86-NEXT: cmovsl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx 
@@ -368,33 +369,32 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl %esi, %ecx -; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %edi, %esi -; X86-NEXT: cmovnsl %edx, %ecx +; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %edi, %ecx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %edx Index: llvm/test/CodeGen/X86/smul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/smul_fix_sat.ll +++ llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -61,26 +61,27 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %edi, %eax -; X86-NEXT: imull %esi +; X86-NEXT: imull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %esi, %edx @@ -89,40 +90,40 @@ ; X86-NEXT: subl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %ebx, %ebp ; X86-NEXT: cmovnsl %edx, %esi ; X86-NEXT: movl %esi, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %edi +; X86-NEXT: cmovnsl %ebp, %edx ; X86-NEXT: cmovnsl %esi, %ecx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %edx, %edx ; X86-NEXT: setg %bl ; X86-NEXT: sete %bh ; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: 
setae %dl -; X86-NEXT: andb %bh, %dl -; X86-NEXT: orb %bl, %dl -; X86-NEXT: movl (%esp), %ebx -; X86-NEXT: shrdl $2, %eax, %ebx -; X86-NEXT: shrdl $2, %ecx, %eax -; X86-NEXT: testb %dl, %dl +; X86-NEXT: setae %al +; X86-NEXT: andb %bh, %al +; X86-NEXT: orb %bl, %al +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: shrdl $2, %edi, %ebx +; X86-NEXT: shrdl $2, %ecx, %edi +; X86-NEXT: testb %al, %al ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %eax, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovel %ebx, %edx -; X86-NEXT: cmpl $-1, %edi -; X86-NEXT: setl %bl +; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: movl $-1, %edi +; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: setl %dl ; X86-NEXT: sete %al ; X86-NEXT: cmpl $-2, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: andb %al, %cl ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: orb %bl, %cl -; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: orb %dl, %cl +; X86-NEXT: cmovel %edi, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: addl $4, %esp @@ -368,65 +369,62 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 28 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: imull %ebx, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull %ebp, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: addl %edx, %esi ; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi +; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: setb %bl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl %bl, %esi ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: xorl %edi, %edx ; X86-NEXT: xorl %eax, %edi @@ -437,10 +435,10 @@ ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ecx ; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel %ebx, %esi +; X86-NEXT: cmovel %ebp, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %esi, %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 @@ -651,20 +649,20 @@ ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %edi, %esi -; X86-NEXT: cmovnsl %edx, %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovsl %ecx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %esi, %ecx -; X86-NEXT: cmovnsl %ebx, %edx +; X86-NEXT: cmovsl %edi, %edx ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: sets %bh @@ -674,18 +672,18 @@ ; X86-NEXT: andb %bh, %bl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %edx, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: cmpl $-1, %ecx ; X86-NEXT: setl %cl -; X86-NEXT: sete %dl -; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: orb %cl, %dl -; X86-NEXT: cmovnel %edi, %eax -; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: sete %ch +; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: orb %cl, %ch +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -720,52 +718,51 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl 
%edx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %edx ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %esi, %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %edi, %esi -; X86-NEXT: cmovnsl %edx, %ebx -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl $0, %ebp +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnsl %ebx, %ebp +; X86-NEXT: cmovnsl %edi, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %ecx -; X86-NEXT: cmovnsl %ebx, %edx -; X86-NEXT: shrdl $31, %edx, %eax -; X86-NEXT: shrdl $31, %ecx, %edx +; X86-NEXT: cmovnsl %ebp, %ecx +; X86-NEXT: cmovnsl %edx, %esi +; X86-NEXT: shrdl $31, %esi, %eax +; X86-NEXT: shrdl $31, %ecx, %esi ; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000 -; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovll %edx, %esi +; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: cmovll %esi, %edi ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000 ; X86-NEXT: cmovll %edx, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 -; X86-NEXT: cmovgel %esi, %edx +; X86-NEXT: cmovgel %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll =================================================================== --- llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -47,11 +47,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,171798690,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -417,11 +417,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = 
xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -511,11 +511,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -606,10 +606,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -696,10 +696,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -790,11 +790,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = 
[858993458,306783378,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1025,11 +1025,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1450,10 +1450,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -1544,11 +1544,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1640,11 +1640,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd 
%xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1875,11 +1875,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1973,11 +1973,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -2069,11 +2069,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -2165,11 +2165,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; Index: llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll =================================================================== --- llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -332,9 +332,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: test13: Index: llvm/test/CodeGen/X86/uadd_sat.ll =================================================================== --- llvm/test/CodeGen/X86/uadd_sat.ll +++ llvm/test/CodeGen/X86/uadd_sat.ll @@ -151,11 +151,12 @@ ; X64-LABEL: vec: ; X64: # %bb.0: ; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X64-NEXT: paddd %xmm0, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm2 -; X64-NEXT: pcmpgtd %xmm2, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm3 +; X64-NEXT: paddd %xmm1, %xmm0 +; X64-NEXT: pxor %xmm0, %xmm2 +; X64-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-NEXT: por %xmm3, %xmm0 ; X64-NEXT: retq %tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp Index: llvm/test/CodeGen/X86/uadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -522,21 +522,23 @@ ; SSE2-LABEL: v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i32: @@ -588,21 +590,23 @@ ; SSE2-LABEL: v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i32: @@ -654,33 +658,37 @@ ; SSE2-LABEL: v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: paddd %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: paddd %xmm1, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: paddd %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: paddd %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i32: @@ -741,57 +749,65 @@ ; SSE2-LABEL: v16i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; 
SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: paddd %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm4, %xmm9 +; SSSE3-NEXT: movdqa %xmm0, %xmm9 ; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: paddd %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: paddd %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: paddd %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 +; SSSE3-NEXT: por %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: paddd %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: paddd %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: paddd %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: paddd %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v16i32: @@ -863,16 +879,17 @@ ; SSE-LABEL: v2i64: ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq @@ -923,31 +940,33 @@ ; SSE-LABEL: v4i64: ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pxor %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: 
pxor %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: paddq %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: v4i64: @@ -1003,57 +1022,61 @@ ; SSE-LABEL: v8i64: ; SSE: # %bb.0: ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: paddq %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pxor %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pcmpgtd %xmm4, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] ; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: paddq %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pxor %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm8, %xmm9 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] -; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pxor %xmm8, %xmm4 +; SSE-NEXT: paddq %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE-NEXT: 
pcmpeqd %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pxor %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: paddq %xmm3, %xmm7 -; SSE-NEXT: pxor %xmm8, %xmm3 -; SSE-NEXT: pxor %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE-NEXT: pcmpeqd %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pxor %xmm8, %xmm4 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: v8i64: Index: llvm/test/CodeGen/X86/umul-with-overflow.ll =================================================================== --- llvm/test/CodeGen/X86/umul-with-overflow.ll +++ llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -34,9 +34,10 @@ ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry +; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: addl %esi, %edi -; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: leal (%rdi,%rsi), %eax +; X64-NEXT: addl %eax, %eax ; X64-NEXT: retq entry: %tmp0 = add i32 %b, %a Index: llvm/test/CodeGen/X86/umul_fix.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix.ll +++ llvm/test/CodeGen/X86/umul_fix.ll @@ -45,19 +45,19 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx @@ -306,31 +306,31 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shldl $1, %edx, %ecx -; X86-NEXT: shrdl $31, %edx, %eax -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: shrdl $31, %ecx, %eax +; X86-NEXT: movl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/umul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix_sat.ll +++ llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,27 +52,26 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax Index: llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll =================================================================== --- llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -41,11 +41,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,171798691,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -197,10 +197,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -275,10 +275,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379] -; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783379,306783379,2,306783379] +; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -360,11 +360,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -445,11 +445,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673] -; CHECK-SSE41-NEXT: pmaxud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,306783379,2,42949673] +; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -529,10 +529,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd 
%xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -609,10 +609,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -694,11 +694,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -908,11 +908,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -992,10 +992,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -1072,10 +1072,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw 
{{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -1157,11 +1157,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1243,10 +1243,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -1328,11 +1328,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1415,11 +1415,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, 
%xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1543,11 +1543,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1630,11 +1630,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1719,11 +1719,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,858993459] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1806,11 +1806,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud 
%xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1893,11 +1893,11 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: por %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1978,10 +1978,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq @@ -2059,10 +2059,10 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq Index: llvm/test/CodeGen/X86/vec-strict-cmp-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-cmp-128.ll +++ llvm/test/CodeGen/X86/vec-strict-cmp-128.ll @@ -723,11 +723,11 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpneqps %xmm3, %xmm4 -; SSE-32-NEXT: cmpordps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm4, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpordps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm4, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: 
andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -737,10 +737,10 @@ ; SSE-64-NEXT: movaps %xmm2, %xmm4 ; SSE-64-NEXT: cmpneqps %xmm3, %xmm4 ; SSE-64-NEXT: cmpordps %xmm3, %xmm2 -; SSE-64-NEXT: andps %xmm4, %xmm2 -; SSE-64-NEXT: andps %xmm2, %xmm0 -; SSE-64-NEXT: andnps %xmm1, %xmm2 -; SSE-64-NEXT: orps %xmm2, %xmm0 +; SSE-64-NEXT: andps %xmm2, %xmm4 +; SSE-64-NEXT: andps %xmm4, %xmm0 +; SSE-64-NEXT: andnps %xmm1, %xmm4 +; SSE-64-NEXT: orps %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_one_q: @@ -916,11 +916,11 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpeqps %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordps %xmm3, %xmm2 -; SSE-32-NEXT: orps %xmm4, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordps %xmm2, %xmm3 +; SSE-32-NEXT: orps %xmm4, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -930,10 +930,10 @@ ; SSE-64-NEXT: movaps %xmm2, %xmm4 ; SSE-64-NEXT: cmpeqps %xmm3, %xmm4 ; SSE-64-NEXT: cmpunordps %xmm3, %xmm2 -; SSE-64-NEXT: orps %xmm4, %xmm2 -; SSE-64-NEXT: andps %xmm2, %xmm0 -; SSE-64-NEXT: andnps %xmm1, %xmm2 -; SSE-64-NEXT: orps %xmm2, %xmm0 +; SSE-64-NEXT: orps %xmm2, %xmm4 +; SSE-64-NEXT: andps %xmm4, %xmm0 +; SSE-64-NEXT: andnps %xmm1, %xmm4 +; SSE-64-NEXT: orps %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ueq_q: @@ -2383,11 +2383,11 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpneqpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpordpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm4, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpordpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm4, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2397,10 +2397,10 @@ ; SSE-64-NEXT: movapd %xmm2, %xmm4 ; SSE-64-NEXT: cmpneqpd %xmm3, %xmm4 ; SSE-64-NEXT: cmpordpd %xmm3, %xmm2 -; SSE-64-NEXT: andpd %xmm4, %xmm2 -; SSE-64-NEXT: andpd %xmm2, %xmm0 -; SSE-64-NEXT: andnpd %xmm1, %xmm2 -; SSE-64-NEXT: orpd %xmm2, %xmm0 +; SSE-64-NEXT: andpd %xmm2, %xmm4 +; SSE-64-NEXT: andpd %xmm4, %xmm0 +; SSE-64-NEXT: andnpd %xmm1, %xmm4 +; SSE-64-NEXT: orpd %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_one_q: @@ -2576,11 +2576,11 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpeqpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2 -; SSE-32-NEXT: orpd %xmm4, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3 +; SSE-32-NEXT: orpd %xmm4, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2590,10 +2590,10 @@ ; SSE-64-NEXT: movapd %xmm2, %xmm4 ; SSE-64-NEXT: cmpeqpd %xmm3, %xmm4 ; SSE-64-NEXT: cmpunordpd %xmm3, %xmm2 -; SSE-64-NEXT: orpd %xmm4, %xmm2 -; SSE-64-NEXT: andpd %xmm2, %xmm0 -; SSE-64-NEXT: andnpd %xmm1, %xmm2 -; SSE-64-NEXT: orpd %xmm2, %xmm0 +; SSE-64-NEXT: orpd %xmm2, %xmm4 +; SSE-64-NEXT: 
andpd %xmm4, %xmm0 +; SSE-64-NEXT: andnpd %xmm1, %xmm4 +; SSE-64-NEXT: orpd %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ueq_q: @@ -3338,10 +3338,10 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 -; SSE-32-NEXT: cmpeqps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpeqps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3816,11 +3816,11 @@ ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpneqps %xmm3, %xmm4 -; SSE-32-NEXT: cmpordps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm4, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpordps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm4, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3832,10 +3832,10 @@ ; SSE-64-NEXT: movaps %xmm2, %xmm4 ; SSE-64-NEXT: cmpneqps %xmm3, %xmm4 ; SSE-64-NEXT: cmpordps %xmm3, %xmm2 -; SSE-64-NEXT: andps %xmm4, %xmm2 -; SSE-64-NEXT: andps %xmm2, %xmm0 -; SSE-64-NEXT: andnps %xmm1, %xmm2 -; SSE-64-NEXT: orps %xmm2, %xmm0 +; SSE-64-NEXT: andps %xmm2, %xmm4 +; SSE-64-NEXT: andps %xmm4, %xmm0 +; SSE-64-NEXT: andnps %xmm1, %xmm4 +; SSE-64-NEXT: orps %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_one_s: @@ -3918,10 +3918,10 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 -; SSE-32-NEXT: cmpordps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpordps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -4018,11 +4018,11 @@ ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpeqps %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordps %xmm3, %xmm2 -; SSE-32-NEXT: orps %xmm4, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordps %xmm2, %xmm3 +; SSE-32-NEXT: orps %xmm4, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -4034,10 +4034,10 @@ ; SSE-64-NEXT: movaps %xmm2, %xmm4 ; SSE-64-NEXT: cmpeqps %xmm3, %xmm4 ; SSE-64-NEXT: cmpunordps %xmm3, %xmm2 -; SSE-64-NEXT: orps %xmm4, %xmm2 -; SSE-64-NEXT: andps %xmm2, %xmm0 -; SSE-64-NEXT: andnps %xmm1, %xmm2 -; SSE-64-NEXT: orps %xmm2, %xmm0 +; SSE-64-NEXT: orps %xmm2, %xmm4 +; SSE-64-NEXT: andps %xmm4, %xmm0 +; SSE-64-NEXT: andnps %xmm1, %xmm4 +; SSE-64-NEXT: orps %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ueq_s: @@ -4498,10 +4498,10 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 -; SSE-32-NEXT: cmpneqps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpneqps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; 
SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -4596,10 +4596,10 @@ ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: cmpltps %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordps %xmm3, %xmm2 -; SSE-32-NEXT: andps %xmm2, %xmm0 -; SSE-32-NEXT: andnps %xmm1, %xmm2 -; SSE-32-NEXT: orps %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordps %xmm2, %xmm3 +; SSE-32-NEXT: andps %xmm3, %xmm0 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: orps %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -4694,10 +4694,10 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpeqpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpeqpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -5172,11 +5172,11 @@ ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpneqpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpordpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm4, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpordpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm4, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -5188,10 +5188,10 @@ ; SSE-64-NEXT: movapd %xmm2, %xmm4 ; SSE-64-NEXT: cmpneqpd %xmm3, %xmm4 ; SSE-64-NEXT: cmpordpd %xmm3, %xmm2 -; SSE-64-NEXT: andpd %xmm4, %xmm2 -; SSE-64-NEXT: andpd %xmm2, %xmm0 -; SSE-64-NEXT: andnpd %xmm1, %xmm2 -; SSE-64-NEXT: orpd %xmm2, %xmm0 +; SSE-64-NEXT: andpd %xmm2, %xmm4 +; SSE-64-NEXT: andpd %xmm4, %xmm0 +; SSE-64-NEXT: andnpd %xmm1, %xmm4 +; SSE-64-NEXT: orpd %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_one_s: @@ -5274,10 +5274,10 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpordpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpordpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -5374,11 +5374,11 @@ ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpeqpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2 -; SSE-32-NEXT: orpd %xmm4, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3 +; SSE-32-NEXT: orpd %xmm4, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -5390,10 +5390,10 @@ ; SSE-64-NEXT: movapd %xmm2, %xmm4 ; SSE-64-NEXT: cmpeqpd %xmm3, %xmm4 ; SSE-64-NEXT: cmpunordpd %xmm3, %xmm2 -; SSE-64-NEXT: orpd %xmm4, %xmm2 -; SSE-64-NEXT: andpd %xmm2, %xmm0 -; SSE-64-NEXT: andnpd %xmm1, %xmm2 -; SSE-64-NEXT: orpd %xmm2, %xmm0 +; SSE-64-NEXT: orpd %xmm2, %xmm4 +; SSE-64-NEXT: andpd 
%xmm4, %xmm0 +; SSE-64-NEXT: andnpd %xmm1, %xmm4 +; SSE-64-NEXT: orpd %xmm4, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ueq_s: @@ -5854,10 +5854,10 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpneqpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpneqpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -5952,10 +5952,10 @@ ; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: movapd %xmm2, %xmm4 ; SSE-32-NEXT: cmpltpd %xmm3, %xmm4 -; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2 -; SSE-32-NEXT: andpd %xmm2, %xmm0 -; SSE-32-NEXT: andnpd %xmm1, %xmm2 -; SSE-32-NEXT: orpd %xmm2, %xmm0 +; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3 +; SSE-32-NEXT: andpd %xmm3, %xmm0 +; SSE-32-NEXT: andnpd %xmm1, %xmm3 +; SSE-32-NEXT: orpd %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl Index: llvm/test/CodeGen/X86/vec_ctbits.ll =================================================================== --- llvm/test/CodeGen/X86/vec_ctbits.ll +++ llvm/test/CodeGen/X86/vec_ctbits.ll @@ -37,40 +37,40 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlq $1, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $2, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $8, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlq $16, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $32, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $32, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlw $1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlw $2, %xmm1 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: paddb %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: psrlw $4, %xmm2 -; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: psadbw %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psadbw %xmm1, %xmm0 
; CHECK-NEXT: retq %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c @@ -123,16 +123,15 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: psadbw %xmm0, %xmm2 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: packuswb %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c @@ -143,34 +142,34 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $1, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $2, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $4, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $8, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $16, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlw $1, %xmm0 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: psrlw $2, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 @@ -200,16 +199,15 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: psadbw %xmm0, 
%xmm2 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: packuswb %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) ret <2 x i32> %c Index: llvm/test/CodeGen/X86/vec_umulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_umulo.ll +++ llvm/test/CodeGen/X86/vec_umulo.ll @@ -1094,9 +1094,9 @@ ; SSE2-NEXT: psrlw $8, %xmm5 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 @@ -1135,9 +1135,9 @@ ; SSSE3-NEXT: psrlw $8, %xmm5 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm5, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm0 @@ -2385,11 +2385,11 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqw %xmm0, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -2402,11 +2402,11 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pmullw %xmm1, %xmm2 -; SSSE3-NEXT: pmulhuw %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpeqw %xmm0, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pmulhuw %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -2419,11 +2419,11 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pmulhuw %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpeqw %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm1 +; 
SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 Index: llvm/test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- llvm/test/CodeGen/X86/vector-bitreverse.ll +++ llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -25,13 +25,13 @@ ; SSE-NEXT: shlb $2, %al ; SSE-NEXT: shrb $2, %dil ; SSE-NEXT: andb $51, %dil -; SSE-NEXT: orb %al, %dil -; SSE-NEXT: movl %edi, %eax -; SSE-NEXT: andb $85, %al -; SSE-NEXT: addb %al, %al -; SSE-NEXT: shrb %dil -; SSE-NEXT: andb $85, %dil ; SSE-NEXT: orb %dil, %al +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andb $85, %cl +; SSE-NEXT: addb %cl, %cl +; SSE-NEXT: shrb %al +; SSE-NEXT: andb $85, %al +; SSE-NEXT: orb %cl, %al ; SSE-NEXT: retq ; ; AVX-LABEL: test_bitreverse_i8: @@ -42,13 +42,13 @@ ; AVX-NEXT: shlb $2, %al ; AVX-NEXT: shrb $2, %dil ; AVX-NEXT: andb $51, %dil -; AVX-NEXT: orb %al, %dil -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: andb $85, %al -; AVX-NEXT: addb %al, %al -; AVX-NEXT: shrb %dil -; AVX-NEXT: andb $85, %dil ; AVX-NEXT: orb %dil, %al +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andb $85, %cl +; AVX-NEXT: addb %cl, %cl +; AVX-NEXT: shrb %al +; AVX-NEXT: andb $85, %al +; AVX-NEXT: orb %cl, %al ; AVX-NEXT: retq ; ; XOP-LABEL: test_bitreverse_i8: @@ -67,13 +67,13 @@ ; GFNISSE-NEXT: shlb $2, %al ; GFNISSE-NEXT: shrb $2, %dil ; GFNISSE-NEXT: andb $51, %dil -; GFNISSE-NEXT: orb %al, %dil -; GFNISSE-NEXT: movl %edi, %eax -; GFNISSE-NEXT: andb $85, %al -; GFNISSE-NEXT: addb %al, %al -; GFNISSE-NEXT: shrb %dil -; GFNISSE-NEXT: andb $85, %dil ; GFNISSE-NEXT: orb %dil, %al +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andb $85, %cl +; GFNISSE-NEXT: addb %cl, %cl +; GFNISSE-NEXT: shrb %al +; GFNISSE-NEXT: andb $85, %al +; GFNISSE-NEXT: orb %cl, %al ; GFNISSE-NEXT: retq ; ; GFNIAVX-LABEL: test_bitreverse_i8: @@ -84,13 +84,13 @@ ; GFNIAVX-NEXT: shlb $2, %al ; GFNIAVX-NEXT: shrb $2, %dil ; GFNIAVX-NEXT: andb $51, %dil -; GFNIAVX-NEXT: orb %al, %dil -; GFNIAVX-NEXT: movl %edi, %eax -; GFNIAVX-NEXT: andb $85, %al -; GFNIAVX-NEXT: addb %al, %al -; GFNIAVX-NEXT: shrb %dil -; GFNIAVX-NEXT: andb $85, %dil ; GFNIAVX-NEXT: orb %dil, %al +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andb $85, %cl +; GFNIAVX-NEXT: addb %cl, %cl +; GFNIAVX-NEXT: shrb %al +; GFNIAVX-NEXT: andb $85, %al +; GFNIAVX-NEXT: orb %cl, %al ; GFNIAVX-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_i8: @@ -101,13 +101,13 @@ ; GFNIAVX2-NEXT: shlb $2, %al ; GFNIAVX2-NEXT: shrb $2, %dil ; GFNIAVX2-NEXT: andb $51, %dil -; GFNIAVX2-NEXT: orb %al, %dil -; GFNIAVX2-NEXT: movl %edi, %eax -; GFNIAVX2-NEXT: andb $85, %al -; GFNIAVX2-NEXT: addb %al, %al -; GFNIAVX2-NEXT: shrb %dil -; GFNIAVX2-NEXT: andb $85, %dil ; GFNIAVX2-NEXT: orb %dil, %al +; GFNIAVX2-NEXT: movl %eax, %ecx +; GFNIAVX2-NEXT: andb $85, %cl +; GFNIAVX2-NEXT: addb %cl, %cl +; GFNIAVX2-NEXT: shrb %al +; GFNIAVX2-NEXT: andb $85, %al +; GFNIAVX2-NEXT: orb %cl, %al ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: test_bitreverse_i8: @@ -118,13 +118,13 @@ ; GFNIAVX512F-NEXT: shlb $2, %al ; GFNIAVX512F-NEXT: shrb $2, %dil ; GFNIAVX512F-NEXT: andb $51, %dil -; GFNIAVX512F-NEXT: orb %al, %dil -; GFNIAVX512F-NEXT: movl %edi, %eax -; GFNIAVX512F-NEXT: andb $85, %al -; GFNIAVX512F-NEXT: addb %al, %al -; GFNIAVX512F-NEXT: shrb %dil -; GFNIAVX512F-NEXT: andb $85, 
%dil ; GFNIAVX512F-NEXT: orb %dil, %al +; GFNIAVX512F-NEXT: movl %eax, %ecx +; GFNIAVX512F-NEXT: andb $85, %cl +; GFNIAVX512F-NEXT: addb %cl, %cl +; GFNIAVX512F-NEXT: shrb %al +; GFNIAVX512F-NEXT: andb $85, %al +; GFNIAVX512F-NEXT: orb %cl, %al ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: test_bitreverse_i8: @@ -135,13 +135,13 @@ ; GFNIAVX512BW-NEXT: shlb $2, %al ; GFNIAVX512BW-NEXT: shrb $2, %dil ; GFNIAVX512BW-NEXT: andb $51, %dil -; GFNIAVX512BW-NEXT: orb %al, %dil -; GFNIAVX512BW-NEXT: movl %edi, %eax -; GFNIAVX512BW-NEXT: andb $85, %al -; GFNIAVX512BW-NEXT: addb %al, %al -; GFNIAVX512BW-NEXT: shrb %dil -; GFNIAVX512BW-NEXT: andb $85, %dil ; GFNIAVX512BW-NEXT: orb %dil, %al +; GFNIAVX512BW-NEXT: movl %eax, %ecx +; GFNIAVX512BW-NEXT: andb $85, %cl +; GFNIAVX512BW-NEXT: addb %cl, %cl +; GFNIAVX512BW-NEXT: shrb %al +; GFNIAVX512BW-NEXT: andb $85, %al +; GFNIAVX512BW-NEXT: orb %cl, %al ; GFNIAVX512BW-NEXT: retq %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b Index: llvm/test/CodeGen/X86/vector-ext-logic.ll =================================================================== --- llvm/test/CodeGen/X86/vector-ext-logic.ll +++ llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -5,13 +5,11 @@ define <8 x i32> @zext_and_v8i32(<8 x i16> %x, <8 x i16> %y) { ; SSE2-LABEL: zext_and_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_and_v8i32: @@ -28,13 +26,11 @@ define <8 x i32> @zext_or_v8i32(<8 x i16> %x, <8 x i16> %y) { ; SSE2-LABEL: zext_or_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_or_v8i32: @@ -51,13 +47,11 @@ define <8 x i32> @zext_xor_v8i32(<8 x i16> %x, <8 x i16> %y) { ; SSE2-LABEL: zext_xor_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: zext_xor_v8i32: @@ -298,14 +292,12 @@ define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_or: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: bool_zext_or: @@ -323,14 +315,12 @@ define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_xor: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: bool_zext_xor: Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -35,7 +35,7 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: shufpd{{.*#+}} xmm5 = xmm5[0],xmm1[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm2, %xmm1 @@ -731,75 +731,74 @@ ; SSE2-LABEL: var_funnnel_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: pandn %xmm8, %xmm7 ; SSE2-NEXT: psllw $5, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 ; SSE2-NEXT: psrlw $4, 
%xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 ; SSE2-NEXT: paddb %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm6 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 ; SSE2-NEXT: paddb %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm0, %xmm5 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm0, %xmm5 ; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pandn %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pandn %xmm3, %xmm0 ; SSE41-NEXT: psllw $5, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] @@ -818,27 +817,27 @@ ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: psllw $5, %xmm3 -; 
SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: paddb %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: paddb %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psllw $4, %xmm5 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: psllw $2, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: por %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: paddb %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: por %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_funnnel_v16i8: @@ -1002,65 +1001,65 @@ ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pandn %xmm4, %xmm6 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 ; X86-SSE2-NEXT: psllw $5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7 ; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 +; X86-SSE2-NEXT: por %xmm7, %xmm5 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 -; X86-SSE2-NEXT: pandn %xmm1, %xmm7 -; X86-SSE2-NEXT: psrlw $2, %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm5, %xmm7 +; X86-SSE2-NEXT: psrlw $2, %xmm5 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 +; X86-SSE2-NEXT: por %xmm7, %xmm5 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 -; X86-SSE2-NEXT: pand %xmm5, %xmm3 -; X86-SSE2-NEXT: pandn %xmm1, %xmm6 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: por %xmm6, %xmm3 -; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: psllw $5, %xmm2 ; 
X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: psllw $5, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pandn %xmm0, %xmm4 ; X86-SSE2-NEXT: psllw $4, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pandn %xmm0, %xmm4 ; X86-SSE2-NEXT: psllw $2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) Index: llvm/test/CodeGen/X86/vector-fshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -426,47 +426,47 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE2-LABEL: var_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psllw $12, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psllw $12, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: psraw $15, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: paddw %xmm3, %xmm3 -; SSE2-NEXT: psraw $15, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: 
movdqa %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: psrlw $4, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: paddw %xmm4, %xmm4 +; SSE2-NEXT: psraw $15, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $23, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd %xmm5, %xmm2 @@ -474,10 +474,10 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE2-NEXT: psllw $1, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; @@ -671,47 +671,47 @@ ; ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: psllw $12, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: psraw $15, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psllw $12, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: psraw $15, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlw $8, %xmm1 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: paddw %xmm3, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: psraw $15, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: paddw %xmm3, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: psraw $15, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm5 -; X86-SSE2-NEXT: psrlw $2, %xmm1 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: paddw %xmm3, %xmm3 -; X86-SSE2-NEXT: psraw $15, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: psrlw $1, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm5, %xmm3 +; X86-SSE2-NEXT: paddw %xmm4, %xmm4 +; X86-SSE2-NEXT: 
movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: psraw $15, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $4, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm5, %xmm3 +; X86-SSE2-NEXT: paddw %xmm4, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: psraw $15, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm5, %xmm3 +; X86-SSE2-NEXT: paddw %xmm4, %xmm4 +; X86-SSE2-NEXT: psraw $15, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm1 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm4 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm5, %xmm1 -; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE2-NEXT: paddd %xmm5, %xmm4 +; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd %xmm5, %xmm2 @@ -719,10 +719,10 @@ ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; X86-SSE2-NEXT: psllw $1, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm4, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) @@ -742,24 +742,24 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 ; SSE2-NEXT: paddb %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm7 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: psrlw $2, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-NEXT: por %xmm7, %xmm4 ; SSE2-NEXT: paddb %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; 
SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: psrlw $1, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm6, %xmm4 @@ -1003,24 +1003,24 @@ ; X86-SSE2-NEXT: movdqa %xmm4, %xmm7 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7 ; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm7 -; X86-SSE2-NEXT: pandn %xmm1, %xmm7 -; X86-SSE2-NEXT: psrlw $2, %xmm1 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm4, %xmm7 +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE2-NEXT: pandn %xmm1, %xmm6 -; X86-SSE2-NEXT: psrlw $1, %xmm1 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pandn %xmm4, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm4 ; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 ; X86-SSE2-NEXT: por %xmm6, %xmm4 Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -78,41 +78,39 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; SSE2-LABEL: test_div7_4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $31, %xmm0 -; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $31, %xmm1 +; SSE2-NEXT: psrad $2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; 
SSE41-LABEL: test_div7_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE41-NEXT: pmuldq %xmm1, %xmm2 -; SSE41-NEXT: pmuldq %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: psrad $2, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pmuldq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE41-NEXT: paddd %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $31, %xmm1 +; SSE41-NEXT: psrad $2, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_div7_4i32: @@ -170,17 +168,16 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE-LABEL: test_div7_16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632] ; SSE-NEXT: pmulhw %xmm3, %xmm2 ; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pmulhw %xmm3, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pmulhw %xmm3, %xmm1 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: paddb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $2, %xmm1 Index: llvm/test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -18,80 +18,80 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; 
SSE2-NEXT: psrlq $32, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $16, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $32, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddb %xmm2, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq 
; ; SSSE3-LABEL: testv2i64: @@ -268,80 +268,80 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $32, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64u: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $16, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $32, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; 
SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddb %xmm2, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64u: @@ -518,34 +518,34 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -560,34 +560,34 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrld $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $16, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: 
pcmpeqd %xmm2, %xmm2 -; SSE3-NEXT: pxor %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -744,34 +744,34 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -786,34 +786,34 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld 
$4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrld $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $16, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE3-NEXT: pxor %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -970,35 +970,35 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; 
SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: retq ; @@ -1006,35 +1006,35 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddb %xmm2, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: retq ; @@ -1160,35 +1160,35 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; 
SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: retq ; @@ -1196,35 +1196,35 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $8, %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddb %xmm2, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: retq ; @@ -1351,31 +1351,31 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -1384,31 +1384,31 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: pxor %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pand %xmm1, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; @@ -1501,31 +1501,31 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: 
movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -1534,31 +1534,31 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: pxor %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pand %xmm1, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll +++ llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll @@ -8,34 +8,34 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $1, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $2, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; 
CHECK-NEXT: psrld $4, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $8, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $8, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $16, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlw $1, %xmm0 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: psrlw $2, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pxor %xmm1, %xmm1 ; CHECK-NEXT: movdqa %xmm0, %xmm2 @@ -69,16 +69,15 @@ ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 -; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: psadbw %xmm0, %xmm2 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: psadbw %xmm0, %xmm1 -; CHECK-NEXT: packuswb %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: psadbw %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: packuswb %xmm2, %xmm0 ; CHECK-NEXT: retq %v2 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %v1, i1 true) ret <2 x i32> %v2 Index: llvm/test/CodeGen/X86/vector-narrow-binop.ll =================================================================== --- llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -153,9 +153,9 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: mulpd %xmm2, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm1 -; SSE-NEXT: addpd %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fmul_v2f64: Index: llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll =================================================================== --- llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ 
llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -1669,10 +1669,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1692,10 +1692,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1988,10 +1988,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2011,10 +2011,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2307,10 +2307,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2330,10 +2330,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2626,10 +2626,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: 
movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2649,10 +2649,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2945,10 +2945,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2968,10 +2968,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3264,10 +3264,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3287,10 +3287,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3583,10 +3583,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: 
pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3606,10 +3606,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3902,10 +3902,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -3925,10 +3925,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4221,10 +4221,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4244,10 +4244,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4540,10 +4540,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4563,10 +4563,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa 
%xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4859,10 +4859,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -4882,10 +4882,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -5178,10 +5178,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -5201,10 +5201,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -5497,10 +5497,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -5520,10 +5520,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: pcmpgtw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -5960,17 +5960,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_2_v4i32: @@ -5987,17 +5986,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_2_v4i32: @@ -6340,17 +6338,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_3_v4i32: @@ -6367,17 +6364,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: 
paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_3_v4i32: @@ -6720,17 +6716,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_4_v4i32: @@ -6747,17 +6742,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_4_v4i32: @@ -7100,17 +7094,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq 
{{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_5_v4i32: @@ -7127,17 +7120,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_5_v4i32: @@ -7480,17 +7472,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_6_v4i32: @@ -7507,17 +7498,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: 
psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_6_v4i32: @@ -7860,17 +7850,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_7_v4i32: @@ -7887,17 +7876,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_7_v4i32: @@ -8240,17 +8228,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb 
%xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_8_v4i32: @@ -8267,17 +8254,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_8_v4i32: @@ -8620,17 +8606,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_9_v4i32: @@ -8647,17 +8632,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} 
xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_9_v4i32: @@ -9000,17 +8984,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_10_v4i32: @@ -9027,17 +9010,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_10_v4i32: @@ -9380,17 +9362,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw 
%xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_11_v4i32: @@ -9407,17 +9388,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_11_v4i32: @@ -9760,17 +9740,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_12_v4i32: @@ -9787,17 +9766,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_12_v4i32: @@ 
-10140,17 +10118,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_13_v4i32: @@ -10167,17 +10144,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_13_v4i32: @@ -10520,17 +10496,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_14_v4i32: @@ -10547,17 +10522,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: 
pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_14_v4i32: @@ -10900,17 +10874,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_15_v4i32: @@ -10927,17 +10900,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_15_v4i32: @@ -11280,17 +11252,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = 
xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_16_v4i32: @@ -11307,17 +11278,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_16_v4i32: @@ -11660,17 +11630,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_17_v4i32: @@ -11687,17 +11656,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw 
%xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_17_v4i32: @@ -12040,17 +12008,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_18_v4i32: @@ -12067,17 +12034,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_18_v4i32: @@ -12420,17 +12386,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb 
%xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_19_v4i32: @@ -12447,17 +12412,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_19_v4i32: @@ -12800,17 +12764,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_20_v4i32: @@ -12827,17 +12790,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq 
{{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_20_v4i32: @@ -13180,17 +13142,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_21_v4i32: @@ -13207,17 +13168,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_21_v4i32: @@ -13560,17 +13520,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_22_v4i32: @@ -13587,17 +13546,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_22_v4i32: @@ -13940,17 +13898,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_23_v4i32: @@ -13967,17 +13924,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: 
ugt_23_v4i32: @@ -14320,17 +14276,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_24_v4i32: @@ -14347,17 +14302,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_24_v4i32: @@ -14700,17 +14654,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_25_v4i32: @@ -14727,17 +14680,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, 
%xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_25_v4i32: @@ -15080,17 +15032,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_26_v4i32: @@ -15107,17 +15058,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_26_v4i32: @@ -15460,17 +15410,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} 
xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_27_v4i32: @@ -15487,17 +15436,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_27_v4i32: @@ -15840,17 +15788,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_28_v4i32: @@ -15867,17 +15814,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: 
psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_28_v4i32: @@ -16220,17 +16166,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_29_v4i32: @@ -16247,17 +16192,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_29_v4i32: @@ -16600,17 +16544,16 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; 
SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_30_v4i32: @@ -16627,17 +16570,16 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_30_v4i32: @@ -16971,11 +16913,11 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: paddq %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -16984,11 +16926,11 @@ ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: paddq %xmm1, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pxor %xmm3, %xmm3 -; SSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE3-NEXT: pand %xmm3, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; @@ -16997,11 +16939,11 @@ ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: paddq %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -17084,33 +17026,33 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE2-NEXT: pand 
%xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_2_v2i64: ; SSE3: # %bb.0: ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: paddq %xmm0, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_2_v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: paddq %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_2_v2i64: Index: llvm/test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -158,16 +158,15 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv4i32: @@ -184,16 +183,15 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: psadbw %xmm1, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv4i32: @@ -329,10 +327,10 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; 
SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: retq @@ -351,10 +349,10 @@ ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllw $8, %xmm1 ; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: psrlw $8, %xmm0 ; SSE3-NEXT: retq @@ -804,52 +802,55 @@ ; SSE2-LABEL: ne_1_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ne_1_v2i64: ; SSE3: # %bb.0: ; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm3 ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE3-NEXT: movdqa %xmm0, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: paddq %xmm2, %xmm4 +; SSE3-NEXT: pand %xmm4, %xmm0 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSE3-NEXT: pand %xmm0, %xmm4 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSE3-NEXT: pand %xmm3, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: pxor %xmm2, %xmm0 -; SSE3-NEXT: por %xmm4, %xmm0 +; SSE3-NEXT: por %xmm3, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ne_1_v2i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: paddq %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: paddq %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2] -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] -; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 -; 
SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ne_1_v2i64: Index: llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -321,9 +321,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -331,8 +331,8 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -375,9 +375,9 @@ ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -386,8 +386,8 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -444,9 +444,9 @@ ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -457,8 +457,8 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -562,9 +562,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -572,8 +572,8 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -616,9 +616,9 @@ ; 
SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -627,8 +627,8 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -685,9 +685,9 @@ ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -698,8 +698,8 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1126,9 +1126,9 @@ ; SSE-NEXT: addpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -1330,9 +1330,9 @@ ; SSE-NEXT: addpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -233,9 +233,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -243,8 +243,8 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -273,9 +273,9 @@ ; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -284,8 +284,8 @@ ; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -322,9 +322,9 @@ ; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -335,8 +335,8 @@ ; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -406,9 +406,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -416,8 +416,8 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -446,9 +446,9 @@ ; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -457,8 +457,8 @@ ; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -495,9 +495,9 @@ ; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: mulps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE2-NEXT: mulps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -508,8 +508,8 @@ ; 
SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: mulps %xmm0, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: mulps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -788,9 +788,9 @@ ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -922,9 +922,9 @@ ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -257,33 +257,31 @@ ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psraw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -161,33 +161,31 @@ ; ; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: 
movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psraw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i16: @@ -334,33 +332,31 @@ ; ; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psraw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psraw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: 
movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i16: Index: llvm/test/CodeGen/X86/vector-shift-lshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -227,33 +227,31 @@ ; ; SSE41-LABEL: var_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v8i16: Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -161,33 +161,31 @@ ; ; SSE41-LABEL: var_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb 
%xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v4i16: @@ -334,33 +332,31 @@ ; ; SSE41-LABEL: var_shift_v2i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $2, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_shift_v2i16: Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -23,18 +23,18 @@ ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] -; X86-NEXT: vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_32i16_identity_mask: ; X64: # 
%bb.0: ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] -; X64-NEXT: vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] +; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 %m) %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %res0, <32 x i16> %res0, i32 %m) Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -18,18 +18,18 @@ ; X86: # %bb.0: ; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z} -; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X86-NEXT: vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z} +; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} +; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16i16_identity_mask: ; X64: # %bb.0: ; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z} -; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-NEXT: vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X64-NEXT: retq %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %x0, <16 x i16> %x1, i16 %m) %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %res0, <16 x i16> %res0, i16 %m) Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -154,9 +154,9 @@ ; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 -; X86-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] -; X86-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: @@ -164,27 +164,27 @@ ; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; 
X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 -; X86-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] -; X86-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 -; X64-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] -; X64-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 -; X64-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] -; X64-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %x0, <8 x double> %x1, i8 %m) %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 %m) @@ -258,9 +258,9 @@ ; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 -; X86-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] -; X86-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: @@ -268,27 +268,27 @@ ; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 -; X86-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] -; X86-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] +; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 -; X64-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} 
zmm0 = [7,14,5,12,3,10,1,8] -; X64-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 -; X64-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] -; X64-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 %m) %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 %m) @@ -308,27 +308,27 @@ ; X86: # %bb.0: ; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X86-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 -; X64-AVX512F-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-AVX512F-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 -; X64-AVX512BW-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-AVX512BW-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %res0, <16 x float> %res0, i16 %m) @@ -597,27 +597,27 @@ ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X86-NEXT: 
vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 -; X64-AVX512F-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-AVX512F-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 -; X64-AVX512BW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-AVX512BW-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 %m) %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 %m) Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -31,18 +31,18 @@ ; X86: # %bb.0: ; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z} -; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X86-NEXT: vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z} +; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z} +; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16i8_identity_mask: ; X64: # %bb.0: ; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 -; X64-NEXT: vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z} -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] -; X64-NEXT: vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %x0, <16 x i8> %x1, i16 %m) %res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> , <16 x i8> %res0, <16 x i8> %res0, i16 %m) Index: llvm/test/CodeGen/X86/vector-trunc-packus.ll =================================================================== --- 
llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -321,110 +321,110 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-LABEL: trunc_packus_v4i64_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; 
SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-NEXT: 
pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_packus_v4i64_v4i32:
@@ -2312,10 +2312,10 @@
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
 ; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSSE3-NEXT: retq
@@ -2448,19 +2448,19 @@
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
 ; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
 ; SSE2-NEXT: pslld $16, %xmm0
 ; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
@@ -2475,18 +2475,18 @@
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
 ; SSSE3-NEXT: pand %xmm1, %xmm0
 ; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_packus_v8i32_v8i16:
@@ -2725,18 +2725,18 @@
 ; SSE2-NEXT: por %xmm2, %xmm3
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -2759,18 +2759,18 @@
 ; SSSE3-NEXT: por %xmm2, %xmm3
 ; SSSE3-NEXT: pand %xmm3, %xmm0
 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT: retq
 ;
@@ -3043,37 +3043,37 @@
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
 ; SSE2-NEXT: pand %xmm8, %xmm3
 ; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: packuswb %xmm3, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -3109,17 +3109,17 @@
 ; SSSE3-NEXT: por %xmm1, %xmm4
 ; SSSE3-NEXT: pand %xmm4, %xmm0
 ; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
 ; SSSE3-NEXT: pand %xmm4, %xmm0
 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
 ; SSSE3-NEXT: pxor %xmm2, %xmm1
@@ -5097,10 +5097,10 @@
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -5114,10 +5114,10 @@
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
 ; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
 ; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT: retq
@@ -5126,10 +5126,9 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_packus_v4i32_v4i8:
Index: llvm/test/CodeGen/X86/vector-trunc-ssat.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -313,122 +313,122 @@
 define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
 ; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
 ; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
 ; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
 ; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
 ; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm3
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm5
 ; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm3
 ; SSSE3-NEXT: por %xmm3, %xmm0
 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSSE3-NEXT: retq
@@ -1376,61 +1376,61 @@
 define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_ssat_v4i64_v4i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
 ; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
 ; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
 ; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
 ; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: packssdw %xmm6, %xmm0
 ; SSE2-NEXT: packssdw %xmm0, %xmm0
@@ -1438,61 +1438,61 @@
 ;
 ; SSSE3-LABEL: trunc_ssat_v4i64_v4i16:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
 ; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm3
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm5
 ; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm3
 ; SSSE3-NEXT: por %xmm3, %xmm0
 ; SSSE3-NEXT: packssdw %xmm6, %xmm0
 ; SSSE3-NEXT: packssdw %xmm0, %xmm0
@@ -2485,20 +2485,20 @@
 ; SSE2-NEXT: por %xmm2, %xmm3
 ; SSE2-NEXT: pand %xmm3, %xmm0
 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -2521,20 +2521,20 @@
 ; SSSE3-NEXT: por %xmm2, %xmm3
 ; SSSE3-NEXT: pand %xmm3, %xmm0
 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT: retq
 ;
@@ -2797,38 +2797,38 @@
 ; SSE2-NEXT: por %xmm1, %xmm4
 ; SSE2-NEXT: pand %xmm4, %xmm0
 ; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm6
 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
 ; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: retq
@@ -2863,38 +2863,38 @@
 ; SSSE3-NEXT: por %xmm1, %xmm4
 ; SSSE3-NEXT: pand %xmm4, %xmm0
 ; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
 ; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_ssat_v4i64_v4i8:
@@ -4913,12 +4913,12 @@
 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
 ; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -4932,12 +4932,12 @@
 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
 ; SSSE3-NEXT: pand %xmm2, %xmm0
 ; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm2
 ; SSSE3-NEXT: por %xmm2, %xmm0
 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT: retq
Index: llvm/test/CodeGen/X86/vector-trunc-usat.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2048,18 +2048,17 @@
 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc_usat_v2i64_v2i8:
@@ -2258,37 +2257,37 @@
 define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_usat_v4i64_v4i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
 ; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: retq
@@ -3607,17 +3606,16 @@
 define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v4i32_v4i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc_usat_v4i32_v4i8:
Index: llvm/test/CodeGen/X86/vector-tzcnt-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -435,16 +435,15 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: testv4i32:
@@ -464,16 +463,15 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: testv4i32:
@@ -678,16 +676,15 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: testv4i32u:
@@ -707,16 +704,15 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: testv4i32u:
@@ -921,10 +917,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
 ; SSE2-NEXT: paddb %xmm1, %xmm0
 ; SSE2-NEXT: psrlw $8, %xmm0
 ; SSE2-NEXT: retq
@@ -946,10 +942,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
 ; SSE3-NEXT: paddb %xmm1, %xmm0
 ; SSE3-NEXT: psrlw $8, %xmm0
 ; SSE3-NEXT: retq
@@ -1097,10 +1093,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
 ; SSE2-NEXT: paddb %xmm1, %xmm0
 ; SSE2-NEXT: psrlw $8, %xmm0
 ; SSE2-NEXT: retq
@@ -1122,10 +1118,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
 ; SSE3-NEXT: paddb %xmm1, %xmm0
 ; SSE3-NEXT: psrlw $8, %xmm0
 ; SSE3-NEXT: retq
Index: llvm/test/CodeGen/X86/vselect-packss.ll
===================================================================
--- llvm/test/CodeGen/X86/vselect-packss.ll
+++ llvm/test/CodeGen/X86/vselect-packss.ll
@@ -225,15 +225,14 @@
 ; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packsswb %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: packsswb %xmm5, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE42-LABEL: vselect_packss_v16i64: