diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3375,6 +3375,12 @@
     Known = Known2.abs();
     break;
   }
+  case ISD::USUBSAT: {
+    // The result of usubsat will never be larger than the LHS.
+    Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known.Zero.setHighBits(Known2.countMinLeadingZeros());
+    break;
+  }
   case ISD::UMIN: {
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -83,16 +83,15 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v4, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_and_b32_e32 v5, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
-; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -119,23 +118,21 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v6, s4, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_and_b32_e32 v7, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v6, s4, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v7
-; GFX6-NEXT:    v_max_u32_e32 v0, v0, v6
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
+; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v2, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
+; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v3i16:
@@ -162,28 +159,26 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v10, s4, v4
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_and_b32_e32 v11, s4, v5
+; GFX6-NEXT:    v_and_b32_e32 v9, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v1, v11
-; GFX6-NEXT:    v_max_u32_e32 v0, v0, v10
+; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
+; GFX6-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT:    v_and_b32_e32 v8, s4, v6
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v9, s4, v7
+; GFX6-NEXT:    v_and_b32_e32 v8, s4, v7
 ; GFX6-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_max_u32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_max_u32_e32 v1, v3, v9
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_max_u32_e32 v1, v2, v6
+; GFX6-NEXT:    v_max_u32_e32 v2, v3, v8
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i16:
diff --git a/llvm/test/CodeGen/ARM/usub_sat.ll b/llvm/test/CodeGen/ARM/usub_sat.ll
--- a/llvm/test/CodeGen/ARM/usub_sat.ll
+++ b/llvm/test/CodeGen/ARM/usub_sat.ll
@@ -98,7 +98,6 @@
 ; CHECK-T1-NEXT:  @ %bb.1:
 ; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB2_2:
-; CHECK-T1-NEXT:    uxth r0, r0
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func16:
@@ -106,14 +105,12 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    uxth r0, r0
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func16:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    uxth r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %y)
   ret i16 %tmp
@@ -127,7 +124,6 @@
 ; CHECK-T1-NEXT:  @ %bb.1:
 ; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB3_2:
-; CHECK-T1-NEXT:    uxtb r0, r0
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func8:
@@ -135,14 +131,12 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    uxtb r0, r0
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func8:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    uxtb r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %y)
   ret i8 %tmp
@@ -151,13 +145,11 @@
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; CHECK-T1-LABEL: func3:
 ; CHECK-T1:       @ %bb.0:
-; CHECK-T1-NEXT:    subs r1, r0, r1
+; CHECK-T1-NEXT:    subs r0, r0, r1
 ; CHECK-T1-NEXT:    bhs .LBB4_2
 ; CHECK-T1-NEXT:  @ %bb.1:
-; CHECK-T1-NEXT:    movs r1, #0
+; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB4_2:
-; CHECK-T1-NEXT:    movs r0, #15
-; CHECK-T1-NEXT:    ands r0, r1
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func3:
@@ -165,14 +157,12 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    and r0, r0, #15
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func3:
 ; CHECK-ARM:       @ %bb.0:
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    and r0, r0, #15
 ; CHECK-ARM-NEXT:    bx lr
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y)
   ret i4 %tmp
diff --git a/llvm/test/CodeGen/ARM/usub_sat_plus.ll b/llvm/test/CodeGen/ARM/usub_sat_plus.ll
--- a/llvm/test/CodeGen/ARM/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/usub_sat_plus.ll
@@ -110,7 +110,6 @@
 ; CHECK-T1-NEXT:  @ %bb.1:
 ; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB2_2:
-; CHECK-T1-NEXT:    uxth r0, r0
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func16:
@@ -120,7 +119,6 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    uxth r0, r0
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func16:
@@ -129,7 +127,6 @@
 ; CHECK-ARM-NEXT:    uxth r1, r1
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    uxth r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %a)
@@ -146,7 +143,6 @@
 ; CHECK-T1-NEXT:  @ %bb.1:
 ; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB3_2:
-; CHECK-T1-NEXT:    uxtb r0, r0
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func8:
@@ -156,7 +152,6 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    uxtb r0, r0
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func8:
@@ -165,7 +160,6 @@
 ; CHECK-ARM-NEXT:    uxtb r1, r1
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    uxtb r0, r0
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i8 %y, %z
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %a)
@@ -177,13 +171,12 @@
 ; CHECK-T1:       @ %bb.0:
 ; CHECK-T1-NEXT:    muls r1, r2, r1
 ; CHECK-T1-NEXT:    movs r2, #15
-; CHECK-T1-NEXT:    ands r1, r2
-; CHECK-T1-NEXT:    subs r0, r0, r1
+; CHECK-T1-NEXT:    ands r2, r1
+; CHECK-T1-NEXT:    subs r0, r0, r2
 ; CHECK-T1-NEXT:    bhs .LBB4_2
 ; CHECK-T1-NEXT:  @ %bb.1:
 ; CHECK-T1-NEXT:    movs r0, #0
 ; CHECK-T1-NEXT:  .LBB4_2:
-; CHECK-T1-NEXT:    ands r0, r2
 ; CHECK-T1-NEXT:    bx lr
 ;
 ; CHECK-T2-LABEL: func4:
@@ -193,7 +186,6 @@
 ; CHECK-T2-NEXT:    subs r0, r0, r1
 ; CHECK-T2-NEXT:    it lo
 ; CHECK-T2-NEXT:    movlo r0, #0
-; CHECK-T2-NEXT:    and r0, r0, #15
 ; CHECK-T2-NEXT:    bx lr
 ;
 ; CHECK-ARM-LABEL: func4:
@@ -202,7 +194,6 @@
 ; CHECK-ARM-NEXT:    and r1, r1, #15
 ; CHECK-ARM-NEXT:    subs r0, r0, r1
 ; CHECK-ARM-NEXT:    movlo r0, #0
-; CHECK-ARM-NEXT:    and r0, r0, #15
 ; CHECK-ARM-NEXT:    bx lr
   %a = mul i4 %y, %z
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %a)
diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -124,42 +124,36 @@
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    sub a2, a0, a1
-; RV32I-NEXT:    mv a1, zero
-; RV32I-NEXT:    bltu a0, a2, .LBB2_2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    sub a1, a0, a1
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    bltu a2, a1, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB2_2:
-; RV32I-NEXT:    lui a0, 16
-; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    sub a2, a0, a1
-; RV64I-NEXT:    mv a1, zero
-; RV64I-NEXT:    bltu a0, a2, .LBB2_2
+; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    sub a1, a0, a1
+; RV64I-NEXT:    mv a0, zero
+; RV64I-NEXT:    bltu a2, a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a1, a2
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:  .LBB2_2:
-; RV64I-NEXT:    lui a0, 16
-; RV64I-NEXT:    addiw a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func16:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    maxu a0, a0, a1
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    zext.h a0, a0
 ; RV32IZbb-NEXT:    ret
 ;
 ; RV64IZbb-LABEL: func16:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    maxu a0, a0, a1
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    zext.h a0, a0
 ; RV64IZbb-NEXT:    ret
   %tmp = call i16 @llvm.usub.sat.i16(i16 %x, i16 %y);
   ret i16 %tmp;
@@ -168,38 +162,36 @@
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; RV32I-LABEL: func8:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    bltu a0, a1, .LBB3_2
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    bltu a2, a1, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    andi a0, a2, 255
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func8:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    mv a2, zero
-; RV64I-NEXT:    bltu a0, a1, .LBB3_2
+; RV64I-NEXT:    mv a0, zero
+; RV64I-NEXT:    bltu a2, a1, .LBB3_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a2, a1
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:  .LBB3_2:
-; RV64I-NEXT:    andi a0, a2, 255
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func8:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    maxu a0, a0, a1
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    andi a0, a0, 255
 ; RV32IZbb-NEXT:    ret
 ;
 ; RV64IZbb-LABEL: func8:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    maxu a0, a0, a1
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    andi a0, a0, 255
 ; RV64IZbb-NEXT:    ret
   %tmp = call i8 @llvm.usub.sat.i8(i8 %x, i8 %y);
   ret i8 %tmp;
@@ -208,38 +200,36 @@
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; RV32I-LABEL: func3:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    sub a1, a0, a1
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    bltu a0, a1, .LBB4_2
+; RV32I-NEXT:    mv a0, zero
+; RV32I-NEXT:    bltu a2, a1, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, a1
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB4_2:
-; RV32I-NEXT:    andi a0, a2, 15
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func3:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    sub a1, a0, a1
-; RV64I-NEXT:    mv a2, zero
-; RV64I-NEXT:    bltu a0, a1, .LBB4_2
+; RV64I-NEXT:    mv a0, zero
+; RV64I-NEXT:    bltu a2, a1, .LBB4_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a2, a1
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:  .LBB4_2:
-; RV64I-NEXT:    andi a0, a2, 15
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZbb-LABEL: func3:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    maxu a0, a0, a1
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    andi a0, a0, 15
 ; RV32IZbb-NEXT:    ret
 ;
 ; RV64IZbb-LABEL: func3:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    maxu a0, a0, a1
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    andi a0, a0, 15
 ; RV64IZbb-NEXT:    ret
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y);
   ret i4 %tmp;
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -138,7 +138,6 @@
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE2-NEXT:    psubusw %xmm1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -146,21 +145,21 @@
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    psubusw %xmm1, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    packuswb %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE42-LABEL: combine_trunc_v8i16_v8i8:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE42-NEXT:    psubusw %xmm1, %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    packuswb %xmm0, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: combine_trunc_v8i16_v8i8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = zext <8 x i8> %a0 to <8 x i16>
   %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %1, <8 x i16> %a1)
diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll
--- a/llvm/test/CodeGen/X86/usub_sat.ll
+++ b/llvm/test/CodeGen/X86/usub_sat.ll
@@ -102,16 +102,16 @@
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    subb %sil, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    cmovbl %ecx, %eax
-; X64-NEXT:    andl $15, %eax
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    cmovbl %eax, %ecx
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    retq
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y)
   ret i4 %tmp
diff --git a/llvm/test/CodeGen/X86/usub_sat_plus.ll b/llvm/test/CodeGen/X86/usub_sat_plus.ll
--- a/llvm/test/CodeGen/X86/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_plus.ll
@@ -119,7 +119,7 @@
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    cmovbl %edx, %eax
-; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func4:
@@ -132,7 +132,7 @@
 ; X64-NEXT:    subb %al, %dil
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    cmovbl %ecx, %eax
-; X64-NEXT:    andl $15, %eax
+; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
   %a = mul i4 %y, %z
   %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %a)
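Note (illustrative sketch, not part of the patch): the known-bits rule added above can be sanity-checked with a small scalar model. usubsat(a, b) is a - b clamped at zero, so the result never exceeds a, and every leading zero bit of a is also a leading zero bit of the result. That is exactly why the masking removed or simplified in the tests (uxth/uxtb on ARM, andi/zext.h on RISC-V, andl/pand/pshufb on X86) is redundant: the inputs were already zero-extended, and the saturating subtract preserves those known-zero high bits. A minimal C++ model of the invariant, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Scalar model of ISD::USUBSAT: unsigned subtract, saturating at zero.
static uint32_t usubsat(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }

int main() {
  // If the LHS fits in 16 bits (top 16 bits known zero), the result does
  // too, for any RHS -- mirroring Known.Zero.setHighBits(countMinLeadingZeros).
  for (uint32_t a = 0; a <= 0xFFFF; a += 0x111)
    for (uint32_t b = 0; b <= 0xFFFFF; b += 0x333)
      assert(usubsat(a, b) <= 0xFFFF);
  return 0;
}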