Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34506,6 +34506,47 @@
   return R.getValue(1);
 }
 
+/// Fold {ANY,SIGN,ZERO}_EXTEND(CMOV Op0, Op1, ...) into a wider CMOV of the
+/// extended constants. Matches only an i16 CMOV with a single use whose first
+/// two operands are both constants, extended to i32 or i64; operands 2 and 3
+/// are reused unchanged. Returns the new CMOV, or an empty SDValue otherwise.
+/// This could be beneficial, because:
+///   1) X86TargetLowering::EmitLoweredSelect later can do merging of two (or
+///      more) pseudo-CMOVs only when they go one-after-another and getting
+///      rid of result extension code after CMOV will help that.
+///   2) Promotion of constant CMOV arguments is free, hence the
+///      {ANY,SIGN,ZERO}_EXTEND will just be deleted.
+///   3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes. (64-bit CMOV
+///      is 4 bytes, which is why 32-bit => 64-bit promotion is not done.)
+static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
+  SDValue CMovN = Extend->getOperand(0);
+  if (CMovN.getOpcode() != X86ISD::CMOV)
+    return SDValue();
+
+  EVT TargetVT = Extend->getValueType(0);
+  unsigned ExtendOpcode = Extend->getOpcode();
+  SDLoc DL(Extend);
+
+  EVT VT = CMovN.getValueType();
+  SDValue CMovOp0 = CMovN.getOperand(0);
+  SDValue CMovOp1 = CMovN.getOperand(1);
+
+  bool DoPromoteCMOV =
+      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
+      CMovN.hasOneUse() &&
+      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
+       isa<ConstantSDNode>(CMovOp1.getNode()));
+
+  if (!DoPromoteCMOV)
+    return SDValue();
+
+  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
+  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
+
+  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
+                     CMovN.getOperand(2), CMovN.getOperand(3));
+}
+
 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or
concatenating /// with UNDEFs) of the input to vectors of the same size as the target type @@ -34620,6 +34661,9 @@ if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) + return NewCMov; + if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); @@ -34772,6 +34816,9 @@ } } + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) + return NewCMov; + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; Index: test/CodeGen/X86/cmov-promotion.ll =================================================================== --- test/CodeGen/X86/cmov-promotion.ll +++ test/CodeGen/X86/cmov-promotion.ll @@ -90,21 +90,19 @@ ; CMOV-LABEL: cmov_zpromotion_16_to_32: ; CMOV: # BB#0: ; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movw $12414, %ax # imm = 0x307E -; CMOV-NEXT: movw $-1, %cx -; CMOV-NEXT: cmovnew %ax, %cx -; CMOV-NEXT: movzwl %cx, %eax +; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E +; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF +; CMOV-NEXT: cmovnel %ecx, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_32: ; NO_CMOV: # BB#0: ; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movw $12414, %ax # imm = 0x307E +; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E ; NO_CMOV-NEXT: jne .LBB3_2 ; NO_CMOV-NEXT: # BB#1: -; NO_CMOV-NEXT: movw $-1, %ax +; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF ; NO_CMOV-NEXT: .LBB3_2: -; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 %ret = zext i16 %t0 to i32 @@ -115,21 +113,19 @@ ; CMOV-LABEL: cmov_zpromotion_16_to_64: ; CMOV: # BB#0: ; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movw $12414, %ax # imm = 0x307E -; CMOV-NEXT: movw $-1, %cx -; CMOV-NEXT: cmovnew %ax, %cx -; CMOV-NEXT: movzwl %cx, %eax +; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E +; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF +; CMOV-NEXT: cmovneq %rcx, %rax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: 
cmov_zpromotion_16_to_64: ; NO_CMOV: # BB#0: ; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movw $12414, %ax # imm = 0x307E +; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E ; NO_CMOV-NEXT: jne .LBB4_2 ; NO_CMOV-NEXT: # BB#1: -; NO_CMOV-NEXT: movw $-1, %ax +; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF ; NO_CMOV-NEXT: .LBB4_2: -; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 @@ -250,21 +246,19 @@ ; CMOV-LABEL: cmov_spromotion_16_to_32: ; CMOV: # BB#0: ; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movw $12414, %ax # imm = 0x307E -; CMOV-NEXT: movw $-1, %cx -; CMOV-NEXT: cmovnew %ax, %cx -; CMOV-NEXT: movswl %cx, %eax +; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E +; CMOV-NEXT: movl $-1, %eax +; CMOV-NEXT: cmovnel %ecx, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_spromotion_16_to_32: ; NO_CMOV: # BB#0: ; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movw $12414, %ax # imm = 0x307E +; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E ; NO_CMOV-NEXT: jne .LBB9_2 ; NO_CMOV-NEXT: # BB#1: -; NO_CMOV-NEXT: movw $-1, %ax +; NO_CMOV-NEXT: movl $-1, %eax ; NO_CMOV-NEXT: .LBB9_2: -; NO_CMOV-NEXT: cwtl ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 %ret = sext i16 %t0 to i32 @@ -275,21 +269,19 @@ ; CMOV-LABEL: cmov_spromotion_16_to_64: ; CMOV: # BB#0: ; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movw $12414, %ax # imm = 0x307E -; CMOV-NEXT: movw $-1, %cx -; CMOV-NEXT: cmovnew %ax, %cx -; CMOV-NEXT: movswq %cx, %rax +; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E +; CMOV-NEXT: movq $-1, %rax +; CMOV-NEXT: cmovneq %rcx, %rax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_spromotion_16_to_64: ; NO_CMOV: # BB#0: ; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movw $12414, %ax # imm = 0x307E +; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E ; NO_CMOV-NEXT: jne .LBB10_2 ; NO_CMOV-NEXT: # BB#1: -; NO_CMOV-NEXT: movw $-1, %ax +; NO_CMOV-NEXT: movl $-1, %eax ; NO_CMOV-NEXT: 
.LBB10_2: -; NO_CMOV-NEXT: cwtl ; NO_CMOV-NEXT: movl %eax, %edx ; NO_CMOV-NEXT: sarl $31, %edx ; NO_CMOV-NEXT: retl Index: test/CodeGen/X86/select.ll =================================================================== --- test/CodeGen/X86/select.ll +++ test/CodeGen/X86/select.ll @@ -39,44 +39,58 @@ ; PR2139 define i32 @test2() nounwind { -; CHECK-LABEL: test2: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq _return_false -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: movw $-480, %ax ## imm = 0xFE20 -; CHECK-NEXT: cmovnew %cx, %ax -; CHECK-NEXT: cwtl -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: cmpl $32768, %eax ## imm = 0x8000 -; CHECK-NEXT: jge LBB1_1 -; CHECK-NEXT: ## BB#2: ## %bb91 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: retq -; CHECK-NEXT: LBB1_1: ## %bb90 -; CHECK-NEXT: ## -- End function +; GENERIC-LABEL: test2: +; GENERIC: ## BB#0: ## %entry +; GENERIC-NEXT: pushq %rax +; GENERIC-NEXT: callq _return_false +; GENERIC-NEXT: xorl %ecx, %ecx +; GENERIC-NEXT: testb $1, %al +; GENERIC-NEXT: movl $-480, %eax +; GENERIC-NEXT: cmovnel %ecx, %eax +; GENERIC-NEXT: shll $3, %eax +; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000 +; GENERIC-NEXT: jge LBB1_1 +; GENERIC-NEXT: ## BB#2: ## %bb91 +; GENERIC-NEXT: xorl %eax, %eax +; GENERIC-NEXT: popq %rcx +; GENERIC-NEXT: retq +; GENERIC-NEXT: LBB1_1: ## %bb90 +; GENERIC-NEXT: ## -- End function +; +; ATOM-LABEL: test2: +; ATOM: ## BB#0: ## %entry +; ATOM-NEXT: pushq %rax +; ATOM-NEXT: callq _return_false +; ATOM-NEXT: xorl %ecx, %ecx +; ATOM-NEXT: movl $-480, %edx +; ATOM-NEXT: testb $1, %al +; ATOM-NEXT: cmovnel %ecx, %edx +; ATOM-NEXT: shll $3, %edx +; ATOM-NEXT: cmpl $32768, %edx ## imm = 0x8000 +; ATOM-NEXT: jge LBB1_1 +; ATOM-NEXT: ## BB#2: ## %bb91 +; ATOM-NEXT: xorl %eax, %eax +; ATOM-NEXT: popq %rcx +; ATOM-NEXT: retq +; ATOM-NEXT: LBB1_1: ## %bb90 +; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test2: ; MCU: # BB#0: # %entry ; 
MCU-NEXT: calll return_false +; MCU-NEXT: xorl %ecx, %ecx ; MCU-NEXT: testb $1, %al -; MCU-NEXT: jne .LBB1_1 -; MCU-NEXT: # BB#2: # %entry -; MCU-NEXT: movw $-480, %ax # imm = 0xFE20 -; MCU-NEXT: jmp .LBB1_3 -; MCU-NEXT: .LBB1_1: -; MCU-NEXT: xorl %eax, %eax -; MCU-NEXT: .LBB1_3: # %entry -; MCU-NEXT: cwtl -; MCU-NEXT: shll $3, %eax -; MCU-NEXT: cmpl $32768, %eax # imm = 0x8000 -; MCU-NEXT: jge .LBB1_4 -; MCU-NEXT: # BB#5: # %bb91 +; MCU-NEXT: jne .LBB1_2 +; MCU-NEXT: # BB#1: # %entry +; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20 +; MCU-NEXT: .LBB1_2: +; MCU-NEXT: shll $3, %ecx +; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000 +; MCU-NEXT: jge .LBB1_3 +; MCU-NEXT: # BB#4: # %bb91 ; MCU-NEXT: xorl %eax, %eax ; MCU-NEXT: retl -; MCU-NEXT: .LBB1_4: # %bb90 +; MCU-NEXT: .LBB1_3: # %bb90 entry: %tmp73 = tail call i1 @return_false() %g.0 = select i1 %tmp73, i16 0, i16 -480 Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -5849,51 +5849,51 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx ; AVX512BW-NEXT: xorl %eax, %eax ; AVX512BW-NEXT: cmpw %cx, %dx -; AVX512BW-NEXT: movw $-1, %cx +; AVX512BW-NEXT: movl $65535, %ecx # imm = 0xFFFF ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm4, %esi ; AVX512BW-NEXT: vmovd %xmm5, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm6 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx ; 
AVX512BW-NEXT: vpextrw $3, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5 ; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx @@ -5901,49 +5901,49 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm5, %esi ; AVX512BW-NEXT: vmovd %xmm6, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm7 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: 
cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm5 @@ -5952,97 +5952,97 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm5, %esi ; AVX512BW-NEXT: vmovd %xmm6, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm7 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7 ; 
AVX512BW-NEXT: vpextrw $2, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5 ; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $1, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm2, %esi ; AVX512BW-NEXT: vmovd %xmm0, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; 
AVX512BW-NEXT: vmovd %esi, %xmm6 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm0, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 @@ -6053,49 +6053,49 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm2, %esi ; 
AVX512BW-NEXT: vmovd %xmm4, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm5 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm4, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4 ; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx @@ -6103,49 +6103,49 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi ; AVX512BW-NEXT: cmpw 
%dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm4, %esi ; AVX512BW-NEXT: vmovd %xmm5, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm6 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 ; AVX512BW-NEXT: vinserti128 $1, 
%xmm2, %ymm4, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 @@ -6154,96 +6154,96 @@ ; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm4, %esi ; AVX512BW-NEXT: vmovd %xmm5, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm6 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx ; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, 
%edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4 ; AVX512BW-NEXT: vpextrw $1, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $1, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vmovd %xmm3, %esi ; AVX512BW-NEXT: vmovd %xmm1, %edi ; AVX512BW-NEXT: cmpw %si, %di ; AVX512BW-NEXT: movl $0, %esi -; AVX512BW-NEXT: cmovgw %cx, %si +; AVX512BW-NEXT: cmovgl %ecx, %esi ; AVX512BW-NEXT: vmovd %esi, %xmm5 ; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $2, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $2, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $3, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $4, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $5, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrw $6, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si ; AVX512BW-NEXT: movl $0, %edx -; AVX512BW-NEXT: cmovgw %cx, %dx +; AVX512BW-NEXT: cmovgl %ecx, %edx ; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrw 
$7, %xmm1, %esi ; AVX512BW-NEXT: cmpw %dx, %si -; AVX512BW-NEXT: cmovgw %cx, %ax +; AVX512BW-NEXT: cmovgl %ecx, %eax ; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm5, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1