diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -784,6 +784,14 @@
   EVT PromotedType = Op1Promoted.getValueType();
   unsigned NewBits = PromotedType.getScalarSizeInBits();

+  if (Opcode == ISD::UADDSAT) {
+    APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+    SDValue Add =
+        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+  }
+
   // USUBSAT can always be promoted as long as we have zero-extended the args.
   if (Opcode == ISD::USUBSAT)
     return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
@@ -799,7 +807,6 @@
   case ISD::SSHLSAT:
     ShiftOp = ISD::SRA;
     break;
-  case ISD::UADDSAT:
   case ISD::USHLSAT:
     ShiftOp = ISD::SRL;
     break;
@@ -822,14 +829,6 @@
     return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
   }

-  if (Opcode == ISD::UADDSAT) {
-    APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
-    SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
-    SDValue Add =
-        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
-    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
-  }
-
   unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
   APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
   APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
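The hoisted block above makes ISD::UADDSAT the one saturating opcode that promotes without any shifting: since the promoted operands are zero-extended, the wide add cannot wrap, and an unsigned min against the old type's all-ones value gives the exact saturated result. A minimal standalone C++ sketch of that identity for i8 promoted to i16 (the helper name is invented for illustration; this is plain arithmetic, not SelectionDAG API):

```cpp
#include <algorithm>
#include <cstdint>

// Minimal model of the promoted UADDSAT lowering, for i8 promoted to i16.
// The inputs arrive zero-extended (Op1Promoted/Op2Promoted in the patch),
// so the 16-bit sum is at most 0x1FE and cannot wrap; clamping against the
// old type's all-ones value (the SatMax constant, 0xFF here) then yields
// exactly the saturated 8-bit result.
static inline uint8_t uaddsat8_via_promotion(uint8_t a, uint8_t b) {
  uint16_t sum = uint16_t(a) + uint16_t(b);      // ISD::ADD on the wide type
  return uint8_t(std::min<uint16_t>(sum, 0xFF)); // ISD::UMIN with SatMax
}
```

This replaces the earlier route through the shift-based path, which had to move both operands into the high bits just to reuse the narrow saturating add; the shl/uqadd/ushr and v_lshlrev/clamp/v_lshrrev sequences disappearing from the tests below are that old pattern.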
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -116,22 +116,21 @@
 ; CHECK-NEXT:    ldrb w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
+; CHECK-NEXT:    ldrb w12, [x0, #2]
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldrb w8, [x1, #2]
 ; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrb w9, [x1, #2]
 ; CHECK-NEXT:    mov v0.h[1], w10
+; CHECK-NEXT:    ldrb w9, [x0, #3]
+; CHECK-NEXT:    ldrb w10, [x1, #3]
 ; CHECK-NEXT:    mov v1.h[1], w11
-; CHECK-NEXT:    ldrb w10, [x0, #3]
-; CHECK-NEXT:    ldrb w11, [x1, #3]
-; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    mov v1.h[2], w9
-; CHECK-NEXT:    mov v0.h[3], w10
-; CHECK-NEXT:    mov v1.h[3], w11
-; CHECK-NEXT:    shl v1.4h, v1.4h, #8
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ushr v0.4h, v0.4h, #8
+; CHECK-NEXT:    mov v0.h[2], w12
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    mov v1.h[3], w10
+; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umin v0.4h, v0.4h, v2.4h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -150,13 +149,12 @@
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
+; CHECK-NEXT:    mov v2.s[1], w11
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w9, [x2]
@@ -192,13 +190,12 @@
 ; CHECK-NEXT:    ldrh w10, [x0, #2]
 ; CHECK-NEXT:    ldrh w11, [x1, #2]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
+; CHECK-NEXT:    mov v2.s[1], w11
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w9, [x2]
@@ -271,12 +268,10 @@
 ; CHECK-LABEL: v16i4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #15
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    shl v1.16b, v1.16b, #4
-; CHECK-NEXT:    shl v0.16b, v0.16b, #4
-; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ushr v0.16b, v0.16b, #4
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -17,19 +17,15 @@
 ; GFX8-LABEL: v_uaddsat_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    v_min_u16_e32 v0, 0xff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
-; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_min_u16_e32 v0, 0xff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -481,26 +481,20 @@
 define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; SSE-LABEL: v16i4:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psllw $4, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    psllw $4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    paddusb %xmm1, %xmm0
-; SSE-NEXT:    psrlw $4, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    paddb %xmm1, %xmm0
+; SSE-NEXT:    pminub %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v16i4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z
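The updated CHECK lines across all three targets encode the same mask/add/unsigned-min shape. As a quick standalone sanity check of that shape (not part of this patch or of the LLVM test suite), the clamped add can be compared exhaustively against a reference i8 saturating add:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively check that the promoted lowering (zero-extend, wide add,
  // unsigned min) matches the definition of 8-bit unsigned saturating add.
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      uint16_t wide = uint16_t(a) + uint16_t(b);
      uint8_t clamped = uint8_t(std::min<uint16_t>(wide, 0xFF));
      uint8_t ref = (a + b > 0xFF) ? 0xFF : uint8_t(a + b);
      assert(clamped == ref);
    }
  }
  return 0;
}
```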