diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -772,6 +772,11 @@ EVT PromotedType = Op1Promoted.getValueType(); unsigned NewBits = PromotedType.getScalarSizeInBits(); + // USUBSAT can always be promoted as long as we have zero-extended the args. + if (Opcode == ISD::USUBSAT) + return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, + Op2Promoted); + // Shift cannot use a min/max expansion, we can't detect overflow if all of // the bits have been shifted out. if (IsShift || TLI.isOperationLegalOrCustom(Opcode, PromotedType)) { @@ -783,7 +788,6 @@ ShiftOp = ISD::SRA; break; case ISD::UADDSAT: - case ISD::USUBSAT: case ISD::USHLSAT: ShiftOp = ISD::SRL; break; @@ -806,12 +810,6 @@ return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } - if (Opcode == ISD::USUBSAT) { - SDValue Max = - DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted); - } - if (Opcode == ISD::UADDSAT) { APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -129,10 +129,7 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ushr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -154,10 +151,7 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w9, [x2] @@ -196,10 +190,7 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w9, [x2] @@ -272,12 +263,9 @@ ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #4 -; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -287,12 +275,9 @@ ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -17,19 +17,13 @@ ; GFX8-LABEL: v_usubsat_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result diff --git a/llvm/test/CodeGen/ARM/usub_sat_plus.ll b/llvm/test/CodeGen/ARM/usub_sat_plus.ll --- a/llvm/test/CodeGen/ARM/usub_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/usub_sat_plus.ll @@ -104,34 +104,31 @@ ; CHECK-T1-LABEL: func16: ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: muls r1, r2, r1 -; CHECK-T1-NEXT: uxth r2, r1 -; CHECK-T1-NEXT: cmp r0, r2 -; CHECK-T1-NEXT: bhi .LBB2_2 +; CHECK-T1-NEXT: uxth r1, r1 +; CHECK-T1-NEXT: subs r0, r0, r1 +; CHECK-T1-NEXT: bhs .LBB2_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: movs r0, #0 ; CHECK-T1-NEXT: .LBB2_2: -; CHECK-T1-NEXT: subs r0, r0, r1 ; CHECK-T1-NEXT: uxth r0, r0 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: func16: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: mul r3, r1, r2 -; CHECK-T2-NEXT: uxth r3, r3 -; CHECK-T2-NEXT: cmp r0, r3 -; CHECK-T2-NEXT: it hi -; CHECK-T2-NEXT: movhi r3, r0 -; CHECK-T2-NEXT: mls r0, r1, r2, r3 +; CHECK-T2-NEXT: muls r1, r2, r1 +; CHECK-T2-NEXT: uxth r1, r1 +; CHECK-T2-NEXT: subs r0, r0, r1 +; CHECK-T2-NEXT: it lo +; CHECK-T2-NEXT: movlo r0, #0 ; CHECK-T2-NEXT: uxth r0, r0 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func16: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: mul r3, r1, r2 -; CHECK-ARM-NEXT: uxth r3, r3 -; CHECK-ARM-NEXT: cmp r0, r3 -; CHECK-ARM-NEXT: movhi r3, r0 -; CHECK-ARM-NEXT: mls r0, r1, r2, r3 +; CHECK-ARM-NEXT: mul r1, r1, r2 +; CHECK-ARM-NEXT: uxth r1, r1 +; CHECK-ARM-NEXT: subs r0, r0, r1 +; CHECK-ARM-NEXT: movlo r0, #0 ; CHECK-ARM-NEXT: uxth r0, r0 ; CHECK-ARM-NEXT: bx lr %a = mul i16 %y, %z @@ -143,34 +140,31 @@ ; CHECK-T1-LABEL: func8: ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: muls r1, r2, r1 -; CHECK-T1-NEXT: uxtb r2, r1 -; CHECK-T1-NEXT: cmp r0, r2 -; CHECK-T1-NEXT: bhi .LBB3_2 +; CHECK-T1-NEXT: uxtb r1, r1 +; CHECK-T1-NEXT: subs r0, r0, r1 +; CHECK-T1-NEXT: bhs .LBB3_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: movs r0, #0 ; CHECK-T1-NEXT: .LBB3_2: -; CHECK-T1-NEXT: subs r0, r0, r1 ; CHECK-T1-NEXT: uxtb r0, r0 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: func8: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: mul r3, r1, r2 -; CHECK-T2-NEXT: uxtb r3, r3 -; CHECK-T2-NEXT: cmp r0, r3 -; CHECK-T2-NEXT: it hi -; CHECK-T2-NEXT: movhi r3, r0 -; CHECK-T2-NEXT: mls r0, r1, r2, r3 +; CHECK-T2-NEXT: muls r1, r2, r1 +; CHECK-T2-NEXT: uxtb r1, r1 +; CHECK-T2-NEXT: subs r0, r0, r1 +; CHECK-T2-NEXT: it lo +; CHECK-T2-NEXT: movlo r0, #0 ; CHECK-T2-NEXT: uxtb r0, r0 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func8: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: smulbb r3, r1, r2 -; CHECK-ARM-NEXT: uxtb r3, r3 -; CHECK-ARM-NEXT: cmp r0, r3 -; CHECK-ARM-NEXT: movhi r3, r0 -; CHECK-ARM-NEXT: mls r0, r1, r2, r3 +; CHECK-ARM-NEXT: smulbb r1, r1, r2 +; CHECK-ARM-NEXT: uxtb r1, r1 +; CHECK-ARM-NEXT: subs r0, r0, r1 +; CHECK-ARM-NEXT: movlo r0, #0 ; CHECK-ARM-NEXT: uxtb r0, r0 ; CHECK-ARM-NEXT: bx lr %a = mul i8 %y, %z @@ -183,35 +177,31 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: muls r1, r2, r1 ; CHECK-T1-NEXT: movs r2, #15 -; CHECK-T1-NEXT: mov r3, r1 -; CHECK-T1-NEXT: ands r3, r2 -; CHECK-T1-NEXT: cmp r0, r3 -; CHECK-T1-NEXT: bhi .LBB4_2 +; CHECK-T1-NEXT: ands r1, r2 +; CHECK-T1-NEXT: subs r0, r0, r1 +; CHECK-T1-NEXT: bhs .LBB4_2 ; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: mov r0, r3 +; CHECK-T1-NEXT: movs r0, #0 ; CHECK-T1-NEXT: .LBB4_2: -; CHECK-T1-NEXT: subs r0, r0, r1 ; CHECK-T1-NEXT: ands r0, r2 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: func4: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: mul r3, r1, r2 -; CHECK-T2-NEXT: and r3, r3, #15 -; CHECK-T2-NEXT: cmp r0, r3 -; CHECK-T2-NEXT: it hi -; CHECK-T2-NEXT: movhi r3, r0 -; CHECK-T2-NEXT: mls r0, r1, r2, r3 +; CHECK-T2-NEXT: muls r1, r2, r1 +; CHECK-T2-NEXT: and r1, r1, #15 +; CHECK-T2-NEXT: subs r0, r0, r1 +; CHECK-T2-NEXT: it lo +; CHECK-T2-NEXT: movlo r0, #0 ; CHECK-T2-NEXT: and r0, r0, #15 ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: func4: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: smulbb r3, r1, r2 -; CHECK-ARM-NEXT: and r3, r3, #15 -; CHECK-ARM-NEXT: cmp r0, r3 -; CHECK-ARM-NEXT: movhi r3, r0 -; CHECK-ARM-NEXT: mls r0, r1, r2, r3 +; CHECK-ARM-NEXT: smulbb r1, r1, r2 +; CHECK-ARM-NEXT: and r1, r1, #15 +; CHECK-ARM-NEXT: subs r0, r0, r1 +; CHECK-ARM-NEXT: movlo r0, #0 ; CHECK-ARM-NEXT: and r0, r0, #15 ; CHECK-ARM-NEXT: bx lr %a = mul i4 %y, %z diff --git a/llvm/test/CodeGen/X86/usub_sat_plus.ll b/llvm/test/CodeGen/X86/usub_sat_plus.ll --- a/llvm/test/CodeGen/X86/usub_sat_plus.ll +++ b/llvm/test/CodeGen/X86/usub_sat_plus.ll @@ -111,22 +111,15 @@ define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind { ; X86-LABEL: func4: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: mulb {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andb $15, %dl -; X86-NEXT: movzbl %dl, %esi -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: cmpb %dl, %cl -; X86-NEXT: cmovbel %esi, %ebx -; X86-NEXT: subb %al, %bl -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: andb $15, %al +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subb %al, %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: cmovbl %edx, %eax ; X86-NEXT: andl $15, %eax -; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: func4: @@ -134,13 +127,11 @@ ; X64-NEXT: movl %esi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: mulb %dl -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $15, %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: cmpb %cl, %dil -; X64-NEXT: cmoval %edi, %ecx -; X64-NEXT: subb %al, %cl -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: andb $15, %al +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: subb %al, %dil +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: cmovbl %ecx, %eax ; X64-NEXT: andl $15, %eax ; X64-NEXT: retq %a = mul i4 %y, %z diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -481,26 +481,18 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-LABEL: v16i4: ; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -509,38 +501,26 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; SSE-LABEL: v16i1: ; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $7, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v16i1: