diff --git a/i/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/i/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -714,6 +714,8 @@
       return true;
     }
 
+    unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
+
     bool isEqualityCmpFoldedWithSignedCmp() const override {
       return false;
     }
diff --git a/i/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/i/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1396,6 +1396,14 @@
   return VT.isScalarInteger();
 }
 
+unsigned PPCTargetLowering::getCustomCtpopCost(EVT VT,
+                                               ISD::CondCode Cond) const {
+  // FIXME: Tune the result when emulating CTPOP via POPCNTD (fast and slow).
+  if (Subtarget.hasPOPCNTD())
+    return 1;
+  return 8;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER: break;
diff --git a/i/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/i/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -983,6 +983,8 @@
     bool isCtlzFast() const override;
 
+    unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
+
     bool hasBitPreservingFPLogic(EVT VT) const override {
       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
     }
diff --git a/i/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/i/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5331,6 +5331,30 @@
   return Subtarget.hasFastLZCNT();
 }
 
+unsigned X86TargetLowering::getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const {
+  assert(VT.isVector());
+
+  EVT EltVT = VT.getVectorElementType();
+
+  // FIXME: Tune the result when emulating CTPOP via BITALG or VPOPCNTDQ.
+  if (Subtarget.hasBITALG() || Subtarget.hasVPOPCNTDQ())
+    return 1;
+
+  // CTPOP emulation is easier if byte-wise instructions exist.
+  if ((VT.is512BitVector() && Subtarget.hasBWI()) ||
+      (VT.is256BitVector() && Subtarget.hasAVX2()) ||
+      (VT.is128BitVector() && Subtarget.hasSSE2())) {
+    return EltVT == MVT::i8 ? 4 : 6;
+  }
+
+  if (VT.is512BitVector()) {
+    if (EltVT == MVT::i64) return 10;
+    if (EltVT == MVT::i32) return 14;
+  }
+
+  return EltVT == MVT::i8 ?
2 : 4; +} + bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { return true; diff --git a/i/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll --- a/i/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll @@ -106,52 +106,26 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ugt_2_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI2_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI2_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI2_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI2_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 2, 5 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_2_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI2_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI2_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI2_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI2_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 2, 5 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_2_v16i8: @@ -201,54 +175,24 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ult_3_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI3_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI3_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI3_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI3_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 3 -; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 3, 2 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_3_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI3_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI3_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI3_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI3_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; 
PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 3 -; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 3, 2 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_3_v16i8: @@ -299,54 +243,30 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ugt_3_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI4_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI4_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI4_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 3 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 2, 3 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_3_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI4_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI4_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI4_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 3 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 2, 3 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_3_v16i8: @@ -397,52 +317,28 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ult_4_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI5_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI5_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI5_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI5_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vcmpgtub 2, 4, 2 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: 
vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_4_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI5_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI5_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI5_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI5_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vcmpgtub 2, 4, 2 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_4_v16i8: @@ -492,52 +388,34 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ugt_4_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI6_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI6_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI6_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vcmpgtub 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_4_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI6_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI6_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI6_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vcmpgtub 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_4_v16i8: @@ -587,54 +465,32 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ult_5_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI7_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI7_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI7_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI7_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 
-; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 5 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 3, 2 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_5_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI7_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI7_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI7_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI7_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 5 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 3, 2 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_5_v16i8: @@ -685,54 +541,38 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ugt_5_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI8_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI8_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI8_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 5 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 2, 3 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_5_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI8_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI8_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI8_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 5 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: 
vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 2, 3 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_5_v16i8: @@ -783,54 +623,36 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ult_6_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI9_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI9_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI9_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 6 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 3, 2 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_6_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI9_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI9_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI9_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 6 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 3, 2 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_6_v16i8: @@ -881,54 +703,42 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ugt_6_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI10_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI10_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI10_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI10_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 6 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 2, 3 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; 
PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_6_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI10_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI10_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI10_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI10_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 6 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 2, 3 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_6_v16i8: @@ -979,54 +789,40 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; PWR5-LABEL: ult_7_v16i8: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI11_0@toc@ha -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI11_0@toc@l -; PWR5-NEXT: vsrb 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI11_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI11_1@toc@l -; PWR5-NEXT: vspltisb 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsububm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrb 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 4 -; PWR5-NEXT: vaddubm 2, 3, 2 -; PWR5-NEXT: vsrb 3, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 -; PWR5-NEXT: vaddubm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 7 +; PWR5-NEXT: vaddubm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vcmpgtub 2, 3, 2 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vaddubm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequb 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_7_v16i8: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI11_0@toc@ha -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI11_0@toc@l -; PWR6-NEXT: vsrb 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI11_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI11_1@toc@l -; PWR6-NEXT: vspltisb 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsububm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrb 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 4 -; PWR6-NEXT: vaddubm 2, 3, 2 -; PWR6-NEXT: vsrb 3, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 -; PWR6-NEXT: vaddubm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 7 +; PWR6-NEXT: vaddubm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vcmpgtub 2, 3, 2 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vaddubm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; 
PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequb 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_7_v16i8: @@ -1174,62 +970,26 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_2_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI14_0@toc@ha -; PWR5-NEXT: vspltish 3, 1 -; PWR5-NEXT: addi 3, 3, .LCPI14_0@toc@l -; PWR5-NEXT: vsrh 3, 2, 3 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI14_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI14_1@toc@l -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vand 3, 3, 4 -; PWR5-NEXT: lvx 4, 0, 3 -; PWR5-NEXT: vsubuhm 2, 2, 3 -; PWR5-NEXT: vand 3, 2, 4 -; PWR5-NEXT: vsrh 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vadduhm 2, 3, 2 -; PWR5-NEXT: vspltish 3, 4 -; PWR5-NEXT: vsrh 3, 2, 3 -; PWR5-NEXT: vadduhm 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 15 +; PWR5-NEXT: vadduhm 3, 2, 3 ; PWR5-NEXT: vxor 4, 4, 4 ; PWR5-NEXT: vand 2, 2, 3 -; PWR5-NEXT: vspltisb 3, 1 -; PWR5-NEXT: vmladduhm 2, 2, 3, 4 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vcmpgtuh 2, 2, 5 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_2_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI14_0@toc@ha -; PWR6-NEXT: vspltish 3, 1 -; PWR6-NEXT: addi 3, 3, .LCPI14_0@toc@l -; PWR6-NEXT: vsrh 3, 2, 3 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI14_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI14_1@toc@l -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vand 3, 3, 4 -; PWR6-NEXT: lvx 4, 0, 3 -; PWR6-NEXT: vsubuhm 2, 2, 3 -; PWR6-NEXT: vand 3, 2, 4 -; PWR6-NEXT: vsrh 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vadduhm 2, 3, 2 -; PWR6-NEXT: vspltish 3, 4 -; PWR6-NEXT: vsrh 3, 2, 3 -; PWR6-NEXT: vadduhm 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 15 +; PWR6-NEXT: vadduhm 3, 2, 3 ; PWR6-NEXT: vxor 4, 4, 4 ; PWR6-NEXT: vand 2, 2, 3 -; PWR6-NEXT: vspltisb 3, 1 -; PWR6-NEXT: vmladduhm 2, 2, 3, 4 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vcmpgtuh 2, 2, 5 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_2_v8i16: @@ -1284,64 +1044,24 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_3_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI15_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI15_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI15_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI15_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 3 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_3_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI15_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI15_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: 
addis 3, 2, .LCPI15_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI15_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 3 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_3_v8i16: @@ -1397,64 +1117,30 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_3_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI16_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI16_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI16_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI16_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 3 -; PWR5-NEXT: vcmpgtuh 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_3_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI16_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI16_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI16_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI16_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 3 -; PWR6-NEXT: vcmpgtuh 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_3_v8i16: @@ -1510,62 +1196,28 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_4_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI17_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, 
.LCPI17_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI17_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI17_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vcmpgtuh 2, 5, 2 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_4_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI17_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI17_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI17_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI17_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vcmpgtuh 2, 5, 2 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_4_v8i16: @@ -1620,62 +1272,34 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_4_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI18_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI18_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI18_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI18_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 15 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 ; PWR5-NEXT: vand 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vcmpgtuh 2, 2, 5 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_4_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI18_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI18_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI18_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI18_1@toc@l -; 
PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 15 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 ; PWR6-NEXT: vand 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vcmpgtuh 2, 2, 5 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_4_v8i16: @@ -1730,64 +1354,32 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_5_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI19_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI19_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI19_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI19_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 5 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_5_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI19_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI19_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI19_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI19_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 5 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_5_v8i16: @@ -1843,64 +1435,38 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_5_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI20_0@toc@ha -; PWR5-NEXT: 
vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI20_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI20_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI20_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 5 -; PWR5-NEXT: vcmpgtuh 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_5_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI20_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI20_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI20_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI20_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 5 -; PWR6-NEXT: vcmpgtuh 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_5_v8i16: @@ -1956,64 +1522,36 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_6_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI21_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI21_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI21_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI21_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 6 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; 
PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_6_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI21_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI21_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI21_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI21_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 6 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_6_v8i16: @@ -2069,64 +1607,42 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_6_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI22_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI22_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI22_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI22_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 6 -; PWR5-NEXT: vcmpgtuh 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_6_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI22_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI22_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI22_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI22_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; 
PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 6 -; PWR6-NEXT: vcmpgtuh 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_6_v8i16: @@ -2182,64 +1698,40 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_7_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI23_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI23_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI23_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI23_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 7 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_7_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI23_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI23_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI23_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI23_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 7 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_7_v8i16: @@ -2295,64 +1787,46 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_7_v8i16: ; PWR5: # 
%bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI24_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI24_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI24_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI24_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 7 -; PWR5-NEXT: vcmpgtuh 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_7_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI24_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI24_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI24_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI24_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 7 -; PWR6-NEXT: vcmpgtuh 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_7_v8i16: @@ -2408,62 +1882,44 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_8_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI25_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI25_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI25_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI25_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; 
PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_8_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI25_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI25_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI25_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI25_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_8_v8i16: @@ -2518,62 +1974,50 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ugt_8_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI26_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI26_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI26_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI26_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vcmpgtuh 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_8_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, 
.LCPI26_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI26_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI26_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI26_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vcmpgtuh 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_8_v8i16: @@ -2628,64 +2072,48 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; PWR5-LABEL: ult_9_v8i16: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI27_0@toc@ha -; PWR5-NEXT: vspltish 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI27_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI27_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI27_1@toc@l -; PWR5-NEXT: vsrh 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 2 -; PWR5-NEXT: vsubuhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: vspltish 5, 4 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vsrh 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduhm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vmladduhm 2, 2, 4, 3 -; PWR5-NEXT: vspltish 3, 8 -; PWR5-NEXT: vsrh 2, 2, 3 -; PWR5-NEXT: vspltish 3, 9 -; PWR5-NEXT: vcmpgtuh 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduhm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequh 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_9_v8i16: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI27_0@toc@ha -; PWR6-NEXT: vspltish 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI27_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI27_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI27_1@toc@l -; PWR6-NEXT: vsrh 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 2 -; PWR6-NEXT: vsubuhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: vspltish 5, 4 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vsrh 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduhm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; 
PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vmladduhm 2, 2, 4, 3 -; PWR6-NEXT: vspltish 3, 8 -; PWR6-NEXT: vsrh 2, 2, 3 -; PWR6-NEXT: vspltish 3, 9 -; PWR6-NEXT: vcmpgtuh 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduhm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequh 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_9_v8i16: @@ -4194,74 +3622,26 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_2_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI42_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI42_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI42_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI42_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 1, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vcmpgtuw 2, 2, 0 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_2_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI42_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI42_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI42_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI42_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 1, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vcmpgtuw 2, 2, 0 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_2_v4i32: @@ -4322,76 +3702,24 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_3_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI43_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; 
PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI43_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI43_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI43_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 3 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_3_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI43_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI43_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI43_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI43_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 3 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_3_v4i32: @@ -4453,76 +3781,30 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_3_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI44_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI44_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI44_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI44_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 3 -; PWR5-NEXT: vcmpgtuw 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; 
PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_3_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI44_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI44_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI44_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI44_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 3 -; PWR6-NEXT: vcmpgtuw 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_3_v4i32: @@ -4584,74 +3866,28 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_4_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI45_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI45_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI45_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI45_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vspltisb 0, 15 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 0 -; PWR5-NEXT: vspltisw 0, -16 -; PWR5-NEXT: vrlw 1, 4, 0 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 0 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vcmpgtuw 2, 5, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_4_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI45_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI45_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI45_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI45_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vspltisb 0, 15 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 
-; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 0 -; PWR6-NEXT: vspltisw 0, -16 -; PWR6-NEXT: vrlw 1, 4, 0 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 0 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vcmpgtuw 2, 5, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_4_v4i32: @@ -4712,74 +3948,34 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_4_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI46_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI46_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI46_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI46_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vspltisb 0, 15 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 0 -; PWR5-NEXT: vspltisw 0, -16 -; PWR5-NEXT: vrlw 1, 4, 0 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 0 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vcmpgtuw 2, 2, 5 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_4_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI46_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI46_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI46_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI46_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vspltisb 0, 15 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 0 -; PWR6-NEXT: vspltisw 0, -16 -; PWR6-NEXT: vrlw 1, 4, 0 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 1, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 0 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vcmpgtuw 2, 2, 5 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: 
ugt_4_v4i32: @@ -4840,76 +4036,32 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_5_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI47_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI47_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI47_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI47_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 5 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_5_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI47_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI47_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI47_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI47_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 5 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_5_v4i32: @@ -4971,76 +4123,38 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_5_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI48_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI48_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI48_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI48_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; 
PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 5 -; PWR5-NEXT: vcmpgtuw 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_5_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI48_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI48_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI48_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI48_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 5 -; PWR6-NEXT: vcmpgtuw 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_5_v4i32: @@ -5102,76 +4216,36 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_6_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI49_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI49_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI49_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI49_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 6 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; 
PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_6_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI49_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI49_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI49_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI49_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 6 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_6_v4i32: @@ -5233,76 +4307,42 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_6_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI50_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI50_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI50_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI50_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 6 -; PWR5-NEXT: vcmpgtuw 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_6_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI50_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI50_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI50_1@toc@ha -; PWR6-NEXT: addi 3, 3, 
.LCPI50_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 6 -; PWR6-NEXT: vcmpgtuw 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_6_v4i32: @@ -5364,76 +4404,40 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_7_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI51_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI51_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI51_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI51_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 7 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_7_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI51_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI51_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI51_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI51_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; 
PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 7 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_7_v4i32: @@ -5495,76 +4499,46 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_7_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI52_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI52_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI52_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI52_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 7 -; PWR5-NEXT: vcmpgtuw 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_7_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI52_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI52_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI52_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI52_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 7 -; PWR6-NEXT: vcmpgtuw 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; 
PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_7_v4i32: @@ -5626,76 +4600,44 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_8_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI53_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI53_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI53_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI53_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 8 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_8_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI53_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI53_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI53_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI53_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 8 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: 
vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_8_v4i32: @@ -5757,76 +4699,50 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ugt_8_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, .LCPI54_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI54_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI54_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI54_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 8 -; PWR5-NEXT: vcmpgtuw 2, 2, 3 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 +; PWR5-NEXT: vnot 2, 2 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_8_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI54_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI54_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI54_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI54_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 8 -; PWR6-NEXT: vcmpgtuw 2, 2, 3 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 +; PWR6-NEXT: vnot 2, 2 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_8_v4i32: @@ -5888,76 +4804,48 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; PWR5-LABEL: ult_9_v4i32: ; PWR5: # %bb.0: -; PWR5-NEXT: addis 3, 2, 
.LCPI55_0@toc@ha -; PWR5-NEXT: vspltisw 4, 1 -; PWR5-NEXT: vxor 3, 3, 3 -; PWR5-NEXT: addi 3, 3, .LCPI55_0@toc@l -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: addis 3, 2, .LCPI55_1@toc@ha -; PWR5-NEXT: addi 3, 3, .LCPI55_1@toc@l -; PWR5-NEXT: vspltisw 0, 2 -; PWR5-NEXT: vsrw 4, 2, 4 -; PWR5-NEXT: vand 4, 4, 5 -; PWR5-NEXT: lvx 5, 0, 3 -; PWR5-NEXT: vsubuwm 2, 2, 4 -; PWR5-NEXT: vand 4, 2, 5 -; PWR5-NEXT: vsrw 2, 2, 0 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, 4 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 4, 2, 5 -; PWR5-NEXT: vspltisb 5, 15 -; PWR5-NEXT: vadduwm 2, 2, 4 -; PWR5-NEXT: vspltisb 4, 1 -; PWR5-NEXT: vand 2, 2, 5 -; PWR5-NEXT: vspltisw 5, -16 -; PWR5-NEXT: vrlw 0, 4, 5 -; PWR5-NEXT: vmulouh 4, 2, 4 -; PWR5-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR5-NEXT: vspltisw 3, 12 -; PWR5-NEXT: vadduwm 3, 3, 3 -; PWR5-NEXT: vslw 2, 2, 5 -; PWR5-NEXT: vadduwm 2, 4, 2 -; PWR5-NEXT: vsrw 2, 2, 3 -; PWR5-NEXT: vspltisw 3, 9 -; PWR5-NEXT: vcmpgtuw 2, 3, 2 +; PWR5-NEXT: vspltisb 3, -1 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 4, 2, 3 +; PWR5-NEXT: vand 2, 2, 4 +; PWR5-NEXT: vadduwm 3, 2, 3 +; PWR5-NEXT: vxor 4, 4, 4 +; PWR5-NEXT: vand 2, 2, 3 +; PWR5-NEXT: vcmpequw 2, 2, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_9_v4i32: ; PWR6: # %bb.0: -; PWR6-NEXT: addis 3, 2, .LCPI55_0@toc@ha -; PWR6-NEXT: vspltisw 4, 1 -; PWR6-NEXT: vxor 3, 3, 3 -; PWR6-NEXT: addi 3, 3, .LCPI55_0@toc@l -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: addis 3, 2, .LCPI55_1@toc@ha -; PWR6-NEXT: addi 3, 3, .LCPI55_1@toc@l -; PWR6-NEXT: vspltisw 0, 2 -; PWR6-NEXT: vsrw 4, 2, 4 -; PWR6-NEXT: vand 4, 4, 5 -; PWR6-NEXT: lvx 5, 0, 3 -; PWR6-NEXT: vsubuwm 2, 2, 4 -; PWR6-NEXT: vand 4, 2, 5 -; PWR6-NEXT: vsrw 2, 2, 0 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, 4 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 4, 2, 5 -; PWR6-NEXT: vspltisb 5, 15 -; PWR6-NEXT: vadduwm 2, 2, 4 -; PWR6-NEXT: vspltisb 4, 1 -; PWR6-NEXT: vand 2, 2, 5 -; PWR6-NEXT: vspltisw 5, -16 -; PWR6-NEXT: vrlw 0, 4, 5 -; PWR6-NEXT: vmulouh 4, 2, 4 -; PWR6-NEXT: vmsumuhm 2, 2, 0, 3 -; PWR6-NEXT: vspltisw 3, 12 -; PWR6-NEXT: vadduwm 3, 3, 3 -; PWR6-NEXT: vslw 2, 2, 5 -; PWR6-NEXT: vadduwm 2, 4, 2 -; PWR6-NEXT: vsrw 2, 2, 3 -; PWR6-NEXT: vspltisw 3, 9 -; PWR6-NEXT: vcmpgtuw 2, 3, 2 +; PWR6-NEXT: vspltisb 3, -1 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 4, 2, 3 +; PWR6-NEXT: vand 2, 2, 4 +; PWR6-NEXT: vadduwm 3, 2, 3 +; PWR6-NEXT: vxor 4, 4, 4 +; PWR6-NEXT: vand 2, 2, 3 +; PWR6-NEXT: vcmpequw 2, 2, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ult_9_v4i32: @@ -12079,92 +10967,34 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_2_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; 
PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 2 -; PWR5-NEXT: subfic 3, 3, 2 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 2 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_2_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 2 -; PWR6-NEXT: subfic 3, 3, 2 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 2 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_2_v2i64: @@ -12213,91 +11043,33 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_3_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: 
rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 3 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_3_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 3 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -12347,92 +11119,42 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_3_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 3 -; PWR5-NEXT: subfic 3, 3, 3 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: 
subfic 4, 4, 3 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_3_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 3 -; PWR6-NEXT: subfic 3, 3, 3 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 3 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_3_v2i64: @@ -12481,91 +11203,41 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_4_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 4 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; 
PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_4_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 4 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -12615,92 +11287,50 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_4_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 4 -; PWR5-NEXT: subfic 3, 3, 4 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 4 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; 
PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_4_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 4 -; PWR6-NEXT: subfic 3, 3, 4 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 4 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_4_v2i64: @@ -12749,91 +11379,49 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_5_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 5 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; 
PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_5_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 5 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -12883,92 +11471,58 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_5_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 5 -; PWR5-NEXT: subfic 3, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 5 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; 
PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_5_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 5 -; PWR6-NEXT: subfic 3, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 5 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_5_v2i64: @@ -13017,91 +11571,57 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_6_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 6 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; 
PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_6_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 6 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -13151,92 +11671,66 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_6_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 6 -; PWR5-NEXT: 
subfic 3, 3, 6 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 6 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_6_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 6 -; PWR6-NEXT: subfic 3, 3, 6 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 6 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_6_v2i64: @@ -13285,91 +11779,65 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_7_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; 
PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 7 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_7_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 7 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -13419,92 +11887,74 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_7_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 
-; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 7 -; PWR5-NEXT: subfic 3, 3, 7 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 7 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_7_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 7 -; PWR6-NEXT: subfic 3, 3, 7 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 7 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; 
PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_7_v2i64: @@ -13553,91 +12003,73 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_8_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 8 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_8_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 
4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 8 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; @@ -13687,92 +12119,82 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ugt_8_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: li 5, 8 -; PWR5-NEXT: subfic 3, 3, 8 -; PWR5-NEXT: rldicl 4, 4, 8, 56 -; PWR5-NEXT: subfe 3, 5, 5 -; PWR5-NEXT: subfic 4, 4, 8 -; PWR5-NEXT: subfe 4, 5, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: subfic 3, 3, 0 +; PWR5-NEXT: subfe 3, 3, 3 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: subfic 4, 4, 0 +; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ugt_8_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 
32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: li 5, 8 -; PWR6-NEXT: subfic 3, 3, 8 -; PWR6-NEXT: rldicl 4, 4, 8, 56 -; PWR6-NEXT: subfe 3, 5, 5 -; PWR6-NEXT: subfic 4, 4, 8 -; PWR6-NEXT: subfe 4, 5, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: subfic 3, 3, 0 +; PWR6-NEXT: subfe 3, 3, 3 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: subfic 4, 4, 0 +; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; ; PWR7-LABEL: ugt_8_v2i64: @@ -13821,91 +12243,81 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; PWR5-LABEL: ult_9_v2i64: ; PWR5: # %bb.0: -; PWR5-NEXT: lis 5, 21845 -; PWR5-NEXT: lis 6, 13107 -; PWR5-NEXT: ori 5, 5, 21845 -; PWR5-NEXT: rotldi 8, 4, 63 -; PWR5-NEXT: rotldi 9, 3, 63 -; PWR5-NEXT: rldimi 5, 5, 32, 0 -; PWR5-NEXT: and 8, 8, 5 -; PWR5-NEXT: and 5, 9, 5 -; PWR5-NEXT: ori 6, 6, 13107 -; PWR5-NEXT: sub 3, 3, 5 -; PWR5-NEXT: rldimi 6, 6, 32, 0 -; PWR5-NEXT: sub 4, 4, 8 -; PWR5-NEXT: and 8, 3, 6 -; PWR5-NEXT: rotldi 3, 3, 62 -; PWR5-NEXT: and 3, 3, 6 -; PWR5-NEXT: lis 7, 3855 -; PWR5-NEXT: and 5, 4, 6 -; PWR5-NEXT: rotldi 4, 4, 62 -; PWR5-NEXT: add 3, 8, 3 -; PWR5-NEXT: lis 9, 257 -; PWR5-NEXT: ori 7, 7, 3855 -; PWR5-NEXT: and 4, 4, 6 -; PWR5-NEXT: rldicl 6, 3, 60, 4 -; PWR5-NEXT: ori 9, 9, 257 -; PWR5-NEXT: rldimi 7, 7, 32, 0 -; PWR5-NEXT: add 4, 5, 4 -; PWR5-NEXT: add 3, 3, 6 -; PWR5-NEXT: rldimi 9, 9, 32, 0 -; PWR5-NEXT: rldicl 5, 4, 60, 4 -; PWR5-NEXT: and 3, 3, 7 -; PWR5-NEXT: add 4, 4, 5 -; PWR5-NEXT: mulld 3, 3, 9 -; PWR5-NEXT: and 4, 4, 7 -; PWR5-NEXT: rldicl 3, 3, 8, 56 -; PWR5-NEXT: li 5, 9 -; PWR5-NEXT: mulld 4, 4, 9 -; PWR5-NEXT: subc 6, 3, 5 -; PWR5-NEXT: rldicl 4, 4, 8, 56 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, 
-1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addi 5, 3, -1 +; PWR5-NEXT: and 3, 3, 5 +; PWR5-NEXT: addi 5, 4, -1 +; PWR5-NEXT: addic 3, 3, -1 ; PWR5-NEXT: subfe 3, 3, 3 -; PWR5-NEXT: subc 5, 4, 5 +; PWR5-NEXT: and 4, 4, 5 +; PWR5-NEXT: addic 4, 4, -1 ; PWR5-NEXT: subfe 4, 4, 4 ; PWR5-NEXT: blr ; ; PWR6-LABEL: ult_9_v2i64: ; PWR6: # %bb.0: -; PWR6-NEXT: lis 5, 21845 -; PWR6-NEXT: lis 6, 13107 -; PWR6-NEXT: ori 5, 5, 21845 -; PWR6-NEXT: rotldi 8, 4, 63 -; PWR6-NEXT: rotldi 9, 3, 63 -; PWR6-NEXT: rldimi 5, 5, 32, 0 -; PWR6-NEXT: and 8, 8, 5 -; PWR6-NEXT: and 5, 9, 5 -; PWR6-NEXT: ori 6, 6, 13107 -; PWR6-NEXT: sub 3, 3, 5 -; PWR6-NEXT: rldimi 6, 6, 32, 0 -; PWR6-NEXT: sub 4, 4, 8 -; PWR6-NEXT: and 8, 3, 6 -; PWR6-NEXT: rotldi 3, 3, 62 -; PWR6-NEXT: and 3, 3, 6 -; PWR6-NEXT: lis 7, 3855 -; PWR6-NEXT: and 5, 4, 6 -; PWR6-NEXT: rotldi 4, 4, 62 -; PWR6-NEXT: add 3, 8, 3 -; PWR6-NEXT: lis 9, 257 -; PWR6-NEXT: ori 7, 7, 3855 -; PWR6-NEXT: and 4, 4, 6 -; PWR6-NEXT: rldicl 6, 3, 60, 4 -; PWR6-NEXT: ori 9, 9, 257 -; PWR6-NEXT: rldimi 7, 7, 32, 0 -; PWR6-NEXT: add 4, 5, 4 -; PWR6-NEXT: add 3, 3, 6 -; PWR6-NEXT: rldimi 9, 9, 32, 0 -; PWR6-NEXT: rldicl 5, 4, 60, 4 -; PWR6-NEXT: and 3, 3, 7 -; PWR6-NEXT: add 4, 4, 5 -; PWR6-NEXT: mulld 3, 3, 9 -; PWR6-NEXT: and 4, 4, 7 -; PWR6-NEXT: rldicl 3, 3, 8, 56 -; PWR6-NEXT: li 5, 9 -; PWR6-NEXT: mulld 4, 4, 9 -; PWR6-NEXT: subc 6, 3, 5 -; PWR6-NEXT: rldicl 4, 4, 8, 56 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addi 5, 3, -1 +; PWR6-NEXT: and 3, 3, 5 +; PWR6-NEXT: addi 5, 4, -1 +; PWR6-NEXT: addic 3, 3, -1 ; PWR6-NEXT: subfe 3, 3, 3 -; PWR6-NEXT: subc 5, 4, 5 +; PWR6-NEXT: and 4, 4, 5 +; PWR6-NEXT: addic 4, 4, -1 ; PWR6-NEXT: subfe 4, 4, 4 ; PWR6-NEXT: blr ; diff --git a/i/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll --- a/i/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -127,106 +127,42 @@ } define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ugt_2_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; 
SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_2_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_2_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSSE3-NEXT: pmaxub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_2_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSE41-NEXT: pmaxub %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_2_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_2_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v16i8: @@ -268,105 +204,38 @@ } define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ult_3_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_3_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_3_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSSE3-NEXT: pminub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_3_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; SSE41-NEXT: pminub %xmm3, %xmm0 -; SSE41-NEXT: 
pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_3_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: paddb %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_3_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -412,106 +281,49 @@ } define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ugt_3_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_3_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_3_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} 
xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSSE3-NEXT: pmaxub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_3_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSE41-NEXT: pmaxub %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_3_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_3_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v16i8: @@ -553,105 +365,44 @@ } define <16 x i8> 
@ult_4_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ult_4_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_4_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_4_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSSE3-NEXT: pminub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_4_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; SSE41-NEXT: pminub %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_4_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_4_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand 
%xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -697,106 +448,56 @@ } define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ugt_4_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_4_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pcmpgtb {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_4_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; SSSE3-NEXT: pmaxub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_4_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; SSE41-NEXT: pmaxub %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_4_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_4_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v16i8: @@ -838,105 +539,51 @@ } define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { -; SSE2-LABEL: ult_5_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; 
SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_5_v16i8: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; SSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_5_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSSE3-NEXT: pminub %xmm3, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_5_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SSE41-NEXT: pminub %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_5_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: paddb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_5_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1667,120 +1314,42 @@ } define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ugt_2_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_2_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 -; SSE3-NEXT: paddb %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 -; SSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_2_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: paddb %xmm3, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_2_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; 
SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_2_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_2_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v8i16: @@ -1822,126 +1391,39 @@ } define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ult_3_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; 
SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_3_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psllw $8, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: psrlw $8, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] -; SSE3-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_3_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_3_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] -; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_3_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_3_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[3,3,3,3,3,3,3,3] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v8i16: @@ -1986,120 +1468,49 @@ } define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ugt_3_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_3_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 -; SSE3-NEXT: paddb %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 -; SSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_3_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: paddb %xmm3, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_3_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa 
%xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_3_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_3_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v8i16: @@ -2141,126 +1552,45 @@ } define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ult_4_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, 
%xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_4_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psllw $8, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: psrlw $8, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] -; SSE3-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_4_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_4_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] -; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_4_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_4_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v8i16: @@ -2305,120 +1635,56 @@ } define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ugt_4_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_4_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 -; SSE3-NEXT: paddb %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 -; SSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_4_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, 
%xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: paddb %xmm3, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_4_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_4_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_4_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw 
%xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v8i16: @@ -2460,126 +1726,52 @@ } define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ult_5_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_5_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psllw $8, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: psrlw $8, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] -; SSE3-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_5_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_5_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] -; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_5_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa 
%xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_5_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v8i16: @@ -2624,120 +1816,63 @@ } define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ugt_5_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_5_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; 
SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 -; SSE3-NEXT: paddb %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 -; SSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_5_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: paddb %xmm3, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_5_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_5_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_5_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, 
%xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v8i16: @@ -2779,126 +1914,59 @@ } define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ult_6_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_6_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psllw $8, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: psrlw $8, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] -; SSE3-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_6_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 
-; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_6_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] -; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_6_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_6_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, 
%xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v8i16: @@ -2943,120 +2011,70 @@ } define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ugt_6_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_6_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psllw $8, %xmm0 -; SSE3-NEXT: paddb %xmm1, %xmm0 -; SSE3-NEXT: psrlw $8, %xmm0 -; SSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_6_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: paddb %xmm3, %xmm0 -; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_6_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: paddb %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pcmpgtw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; 
SSE-LABEL: ugt_6_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_6_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v8i16: @@ -3098,126 +2116,66 @@ } define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { -; SSE2-LABEL: ult_7_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, 
%xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_7_v8i16: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psllw $8, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: psrlw $8, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] -; SSE3-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_7_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: paddb %xmm3, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_7_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] -; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_7_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddw %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm1 +; 
SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_7_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v8i16: @@ -5959,141 +4917,42 @@ } define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ugt_2_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; 
SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_2_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_2_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_2_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm3 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_2_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_2_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: 
vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v4i32: @@ -6148,143 +5007,39 @@ } define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ult_3_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_3_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; 
SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] -; SSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_3_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_3_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm3 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_3_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_3_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v4i32: @@ -6339,141 +5094,49 @@ } define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ugt_3_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_3_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_3_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: 
pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_3_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm3 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_3_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_3_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: 
vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v4i32: @@ -6528,143 +5191,45 @@ } define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ult_4_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_4_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] -; SSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_4_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: 
pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_4_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm3 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_4_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_4_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v4i32: @@ -6719,141 +5284,56 @@ } define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ugt_4_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_4_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_4_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_4_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm3 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_4_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_4_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, 
%xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v4i32: @@ -6908,143 +5388,52 @@ } define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ult_5_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_5_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] -; SSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_5_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_5_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
-; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm3 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_5_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_5_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; 
AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v4i32: @@ -7099,141 +5488,63 @@ } define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ugt_5_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_5_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_5_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_5_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: 
pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm3 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_5_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_5_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, 
%xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v4i32: @@ -7288,143 +5599,59 @@ } define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ult_6_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_6_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] -; SSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_6_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_6_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; 
SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm3 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_6_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_6_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; 
AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v4i32: @@ -7479,141 +5706,70 @@ } define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ugt_6_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ugt_6_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ugt_6_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: paddb %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ugt_6_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} 
xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm3 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_6_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ugt_6_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v4i32: @@ -7668,143 +5824,66 @@ } define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { -; SSE2-LABEL: ult_7_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psadbw %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: ult_7_v4i32: -; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE3-NEXT: psadbw %xmm0, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: psadbw %xmm0, %xmm1 -; SSE3-NEXT: packuswb %xmm2, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] -; SSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: ult_7_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psadbw %xmm0, %xmm1 -; 
SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: psadbw %xmm0, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ult_7_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE41-NEXT: psadbw %xmm0, %xmm3 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] -; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_7_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: ult_7_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: 
vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v4i32: @@ -17197,144 +15276,88 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ugt_2_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483650,2147483650] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_2_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483650,2147483650] -; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_2_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483650,2147483650] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ugt_2_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483650,2147483650] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_2_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, 
%xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v2i64: @@ -17381,146 +15404,79 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ult_3_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_3_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, 
%xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_3_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_3_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ult_3_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v2i64: @@ -17569,144 +15525,104 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ugt_3_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_3_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: 
psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: pxor %xmm3, %xmm3 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE3-NEXT: pand %xmm3, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_3_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ugt_3_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483651,2147483651] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_3_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v2i64: @@ -17753,146 +15669,94 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ult_4_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand 
%xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_4_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_4_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_4_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd 
%xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ult_4_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v2i64: @@ -17941,144 +15805,120 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ugt_4_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb 
%xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_4_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_4_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 
%xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ugt_4_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483652,2147483652] +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_4_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v2i64: @@ -18125,146 +15965,110 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ult_5_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_5_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: 
pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_5_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_5_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ult_5_v2i64: ; 
AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v2i64: @@ -18313,144 +16117,136 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ugt_5_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq 
%xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_5_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: pxor %xmm3, %xmm3 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE3-NEXT: pand %xmm3, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_5_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSSE3-NEXT: pand %xmm3, 
%xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ugt_5_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483653,2147483653] +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_5_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; 
AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v2i64: @@ -18497,146 +16293,126 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ult_6_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_6_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, 
%xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pand %xmm0, %xmm1 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_6_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_6_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ult_6_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v2i64: @@ -18685,144 +16461,152 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ugt_6_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; 
SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ugt_6_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ugt_6_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; 
SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ugt_6_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483654,2147483654] +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ugt_6_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand 
%xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v2i64: @@ -18869,146 +16653,142 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; SSE2-LABEL: ult_7_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483655,2147483655] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: ult_7_v2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm0, %xmm1 -; 
SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483655,2147483655] -; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE3-NEXT: pand %xmm3, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-NEXT: por %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: paddq %xmm1, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 +; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: pxor %xmm2, %xmm2 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: ult_7_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pshufb %xmm2, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 -; SSSE3-NEXT: paddb %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: psadbw %xmm3, %xmm0 -; SSSE3-NEXT: por {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483655,2147483655] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: paddq %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: ult_7_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pshufb %xmm2, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, 
%xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm3, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483655,2147483655] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ult_7_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v2i64: diff --git a/i/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll --- a/i/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ 
b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -139,41 +139,34 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_2_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8: @@ -226,40 +219,30 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_3_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, 
%xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -339,16 +322,16 @@ ; ; AVX2-LABEL: ugt_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8: @@ -426,15 +409,14 @@ ; ; AVX2-LABEL: ult_4_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, 
%ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -514,16 +496,18 @@ ; ; AVX2-LABEL: ugt_4_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8: @@ -601,15 +585,16 @@ ; ; AVX2-LABEL: ult_5_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminub {{.*}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1143,47 +1128,34 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_2_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[2,2,2,2,2,2,2,2] -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v16i16: @@ -1224,48 +1196,31 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_3_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand 
%xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v16i16: @@ -1309,47 +1264,40 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_3_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v16i16: @@ -1390,48 +1338,37 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_4_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; 
AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v16i16: @@ -1475,47 +1412,46 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_4_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, 
%ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v16i16: @@ -1556,48 +1492,43 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_5_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw 
$8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v16i16: @@ -1670,18 +1601,20 @@ ; ; AVX2-LABEL: ugt_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v16i16: @@ -1751,19 +1684,19 @@ ; ; AVX2-LABEL: ult_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v16i16: @@ -1836,18 +1769,22 @@ ; ; AVX2-LABEL: ugt_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; 
AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v16i16: @@ -1917,19 +1854,21 @@ ; ; AVX2-LABEL: ult_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v16i16: @@ -3433,56 +3372,34 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, 
%xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v8i32: @@ -3537,56 +3454,31 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: 
vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v8i32: @@ -3641,56 +3533,40 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_3_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; 
AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v8i32: @@ -3745,56 +3621,37 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, 
%xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v8i32: @@ -3849,56 +3706,46 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_4_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw 
%xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4] -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v8i32: @@ -3953,56 +3800,43 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_5_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5] -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb 
%ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v8i32: @@ -4091,22 +3925,20 @@ ; ; AVX2-LABEL: ugt_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v8i32: @@ -4195,22 +4027,19 @@ ; ; AVX2-LABEL: ult_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v8i32: @@ -4299,22 +4128,22 @@ ; ; AVX2-LABEL: ugt_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v8i32: @@ -4403,22 +4232,21 @@ ; ; AVX2-LABEL: ult_7_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v8i32: @@ -9597,44 +9425,34 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_2_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_2_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v4i64: @@ -9681,44 +9499,31 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_3_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: 
vmovdqa {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v4i64: @@ -9765,44 +9570,40 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_3_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; 
AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v4i64: @@ -9849,44 +9650,37 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_4_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, 
%ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v4i64: @@ -9933,44 +9727,46 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_4_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, 
%ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v4i64: @@ -10017,44 +9813,43 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_5_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] -; 
AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v4i64: @@ -10127,18 +9922,20 @@ ; ; AVX2-LABEL: ugt_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v4i64: @@ -10211,18 +10008,19 @@ ; ; AVX2-LABEL: ult_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v4i64: @@ -10295,18 +10093,22 @@ ; ; AVX2-LABEL: ugt_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, 
%ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v4i64: @@ -10379,18 +10181,21 @@ ; ; AVX2-LABEL: ult_7_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v4i64: diff --git a/i/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll --- a/i/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -125,40 +125,30 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_2_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; 
AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_2_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -216,40 +206,29 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_3_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; 
AVX512BW-LABEL: ult_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -332,15 +311,13 @@ ; ; AVX512BW-LABEL: ugt_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -423,15 +400,13 @@ ; ; AVX512BW-LABEL: ult_4_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -514,15 +489,15 @@ ; ; AVX512BW-LABEL: ugt_4_v64i8: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -605,15 +580,15 @@ ; ; AVX512BW-LABEL: ult_5_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltub {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1151,47 +1126,30 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_2_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, 
%ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_2_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1242,47 +1200,29 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; 
AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1333,47 +1273,36 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, 
%ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1424,47 +1353,35 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, 
%ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1515,47 +1432,42 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; 
AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1606,47 +1518,41 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1726,18 +1632,17 @@ ; ; AVX512BW-LABEL: ugt_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{.*}}(%rip), %zmm0, 
%k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1817,18 +1722,17 @@ ; ; AVX512BW-LABEL: ult_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1908,18 +1812,19 @@ ; ; AVX512BW-LABEL: ugt_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw 
%zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1999,18 +1904,19 @@ ; ; AVX512BW-LABEL: ult_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{.*}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -3591,54 +3497,21 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_2_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: 
vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_2_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -3670,54 +3543,21 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_3_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = 
ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -3749,54 +3589,25 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_3_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: 
vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -3828,54 +3639,25 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_4_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq 
{{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -3907,54 +3689,29 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_4_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, 
%ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 ; 
AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -3986,54 +3743,29 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_5_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -4065,54 +3797,33 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_5_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -4144,54 +3855,33 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_6_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: 
vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -4223,54 +3913,37 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_6_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 
-; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -4302,54 +3975,37 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_7_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; 
AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -4381,34 +4037,21 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_7_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4460,34 +4103,21 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_8_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4539,34 +4169,23 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_8_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: 
vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4618,34 +4237,23 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_9_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; 
AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4697,34 +4305,25 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_9_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4776,34 +4375,25 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_10_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4855,34 +4445,27 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_10_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; 
AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -4934,34 +4517,27 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_11_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, 
%zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5013,34 +4589,29 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_11_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; 
AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5092,34 +4663,29 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_12_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5171,34 +4737,31 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_12_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa 
{{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5250,34 +4813,31 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_13_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5329,34 +4889,33 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_13_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw 
%ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5408,34 +4967,33 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_14_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, 
%zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5487,34 +5045,35 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_14_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; 
AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -5566,34 +5125,35 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_15_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd 
%zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -8247,42 +7807,21 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_2_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_2_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8310,42 +7849,21 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_3_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8373,42 +7891,25 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_3_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; 
AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8436,42 +7937,25 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_4_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8499,42 +7983,29 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_4_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: 
vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8562,42 +8033,29 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_5_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ 
-8625,42 +8083,33 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_5_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8688,42 +8137,33 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_6_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8751,42 +8191,37 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_6_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, 
%ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ugt_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8814,42 +8249,37 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_7_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: ult_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -8877,26 +8307,21 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_7_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; 
AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -8940,26 +8365,21 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_8_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9003,26 +8423,23 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_8_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, 
%ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9066,26 +8483,23 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_9_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9129,26 +8543,25 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) { ; 
AVX512F-LABEL: ugt_9_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9192,26 +8605,25 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_10_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; 
AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9255,26 +8667,27 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_10_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpnleuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; @@ -9318,26 +8731,27 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_11_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: 
vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpltuq {{.*}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ;