diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -296,6 +296,12 @@
   /// Compute known bits resulting from multiplying LHS and RHS.
   static KnownBits computeForMul(const KnownBits &LHS, const KnownBits &RHS);
 
+  /// Compute known bits from sign-extended multiply-hi.
+  static KnownBits mulhs(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute known bits from zero-extended multiply-hi.
+  static KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS);
+
   /// Compute known bits for udiv(LHS, RHS).
   static KnownBits udiv(const KnownBits &LHS, const KnownBits &RHS);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2979,6 +2979,20 @@
     Known = KnownBits::computeForMul(Known, Known2);
     break;
   }
+  case ISD::MULHU: {
+    // mulhu(x,y) -> trunc(srl(mul(zext(x),zext(y)),bw)).
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known = KnownBits::mulhu(Known, Known2);
+    break;
+  }
+  case ISD::MULHS: {
+    // mulhs(x,y) -> trunc(srl(mul(sext(x),sext(y)),bw)).
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known = KnownBits::mulhs(Known, Known2);
+    break;
+  }
   case ISD::UDIV: {
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -489,6 +489,24 @@
   return Res;
 }
 
+KnownBits KnownBits::mulhs(const KnownBits &LHS, const KnownBits &RHS) {
+  unsigned BitWidth = LHS.getBitWidth();
+  assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() &&
+         !RHS.hasConflict() && "Operand mismatch");
+  KnownBits WideLHS = LHS.sext(2 * BitWidth);
+  KnownBits WideRHS = RHS.sext(2 * BitWidth);
+  return computeForMul(WideLHS, WideRHS).extractBits(BitWidth, BitWidth);
+}
+
+KnownBits KnownBits::mulhu(const KnownBits &LHS, const KnownBits &RHS) {
+  unsigned BitWidth = LHS.getBitWidth();
+  assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() &&
+         !RHS.hasConflict() && "Operand mismatch");
+  KnownBits WideLHS = LHS.zext(2 * BitWidth);
+  KnownBits WideRHS = RHS.zext(2 * BitWidth);
+  return computeForMul(WideLHS, WideRHS).extractBits(BitWidth, BitWidth);
+}
+
 KnownBits KnownBits::udiv(const KnownBits &LHS, const KnownBits &RHS) {
   unsigned BitWidth = LHS.getBitWidth();
   assert(!LHS.hasConflict() && !RHS.hasConflict());
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -609,14 +609,14 @@
 ; GFX7LESS-NEXT: s_cbranch_execz BB3_2
 ; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
 ; GFX7LESS-NEXT: s_mov_b32 s10, -1
 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT: s_mov_b32 s8, s2
 ; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
@@ -651,12 +651,12 @@
 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX89-NEXT: s_mov_b32 s8, s2
 ; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5
 ; GFX89-NEXT: s_mul_i32 s2, s2, 5
 ; GFX89-NEXT: s_mov_b32 s11, 0xf000
 ; GFX89-NEXT: s_mov_b32 s10, -1
 ; GFX89-NEXT: s_mov_b32 s9, s3
 ; GFX89-NEXT: v_mov_b32_e32 v1, s2
+; GFX89-NEXT: v_mov_b32_e32 v2, 0
 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX89-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
 ; GFX89-NEXT: s_waitcnt vmcnt(0)
@@ -687,10 +687,10 @@
 ; GCN64-NEXT: s_cbranch_execz BB3_2
 ; GCN64-NEXT: ; %bb.1:
 ; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GCN64-NEXT: v_mov_b32_e32 v2, 0
+; GCN64-NEXT: s_mul_i32 s6, s6, 5
 ; GCN64-NEXT: s_mov_b32 s11, 0x31016000
-; GCN64-NEXT: s_mul_i32 s7, s6, 5
-; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5
-; GCN64-NEXT: v_mov_b32_e32 v1, s7
+; GCN64-NEXT: v_mov_b32_e32 v1, s6
 ; GCN64-NEXT: s_mov_b32 s10, -1
 ; GCN64-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN64-NEXT: s_mov_b32 s8, s2
@@ -724,10 +724,10 @@
 ; GCN32-NEXT: s_cbranch_execz BB3_2
 ; GCN32-NEXT: ; %bb.1:
 ; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5
+; GCN32-NEXT: v_mov_b32_e32 v2, 0
+; GCN32-NEXT: s_mul_i32 s5, s5, 5
 ; GCN32-NEXT: s_mov_b32 s11, 0x31016000
-; GCN32-NEXT: s_mul_i32 s6, s5, 5
-; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5
-; GCN32-NEXT: v_mov_b32_e32 v1, s6
+; GCN32-NEXT: v_mov_b32_e32 v1, s5
 ; GCN32-NEXT: s_mov_b32 s10, -1
 ; GCN32-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN32-NEXT: s_mov_b32 s8, s2
@@ -1700,14 +1700,14 @@
 ; GFX7LESS-NEXT: s_cbranch_execz BB9_2
 ; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5
 ; GFX7LESS-NEXT: s_mov_b32 s10, -1
 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT: s_mov_b32 s8, s2
 ; GFX7LESS-NEXT: s_mov_b32 s9, s3
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
@@ -1742,12 +1742,12 @@
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_mov_b32 s8, s2
 ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5
 ; GFX8-NEXT: s_mul_i32 s2, s2, 5
 ; GFX8-NEXT: s_mov_b32 s11, 0xf000
 ; GFX8-NEXT: s_mov_b32 s10, -1
 ; GFX8-NEXT: s_mov_b32 s9, s3
 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1781,12 +1781,12 @@
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_mov_b32 s8, s2
 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5
 ; GFX9-NEXT: s_mul_i32 s2, s2, 5
 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
 ; GFX9-NEXT: s_mov_b32 s10, -1
 ; GFX9-NEXT: s_mov_b32 s9, s3
 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1818,10 +1818,10 @@
 ; GCN64-NEXT: s_cbranch_execz BB9_2
 ; GCN64-NEXT: ; %bb.1:
 ; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GCN64-NEXT: v_mov_b32_e32 v2, 0
+; GCN64-NEXT: s_mul_i32 s6, s6, 5
 ; GCN64-NEXT: s_mov_b32 s11, 0x31016000
-; GCN64-NEXT: s_mul_i32 s7, s6, 5
-; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5
-; GCN64-NEXT: v_mov_b32_e32 v1, s7
+; GCN64-NEXT: v_mov_b32_e32 v1, s6
 ; GCN64-NEXT: s_mov_b32 s10, -1
 ; GCN64-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN64-NEXT: s_mov_b32 s8, s2
@@ -1858,10 +1858,10 @@
 ; GCN32-NEXT: s_cbranch_execz BB9_2
 ; GCN32-NEXT: ; %bb.1:
 ; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5
+; GCN32-NEXT: v_mov_b32_e32 v2, 0
+; GCN32-NEXT: s_mul_i32 s5, s5, 5
 ; GCN32-NEXT: s_mov_b32 s11, 0x31016000
-; GCN32-NEXT: s_mul_i32 s6, s5, 5
-; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5
-; GCN32-NEXT: v_mov_b32_e32 v1, s6
+; GCN32-NEXT: v_mov_b32_e32 v1, s5
 ; GCN32-NEXT: s_mov_b32 s10, -1
 ; GCN32-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN32-NEXT: s_mov_b32 s8, s2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1058,10 +1058,10 @@
 ; GFX7LESS-NEXT: s_cbranch_execz BB5_2
 ; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
 ; GFX7LESS-NEXT: s_mov_b32 m0, -1
 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1093,9 +1093,9 @@
 ; GFX8-NEXT: s_cbranch_execz BB5_2
 ; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
 ; GFX8-NEXT: s_mul_i32 s4, s4, 5
 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
 ; GFX8-NEXT: s_mov_b32 m0, -1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1127,9 +1127,9 @@
 ; GFX9-NEXT: s_cbranch_execz BB5_2
 ; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
 ; GFX9-NEXT: s_mul_i32 s4, s4, 5
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1160,10 +1160,10 @@
 ; GFX1064-NEXT: s_cbranch_execz BB5_2
 ; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mul_i32 s4, s4, 5
 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX1064-NEXT: s_mul_i32 s5, s4, 5
-; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
-; GFX1064-NEXT: v_mov_b32_e32 v1, s5
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1192,10 +1192,10 @@
 ; GFX1032-NEXT: s_cbranch_execz BB5_2
 ; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mul_i32 s3, s3, 5
 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX1032-NEXT: s_mul_i32 s4, s3, 5
-; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -2104,10 +2104,10 @@
 ; GFX7LESS-NEXT: s_cbranch_execz BB11_2
 ; GFX7LESS-NEXT: ; %bb.1:
 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
 ; GFX7LESS-NEXT: s_mov_b32 m0, -1
 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2139,9 +2139,9 @@
 ; GFX8-NEXT: s_cbranch_execz BB11_2
 ; GFX8-NEXT: ; %bb.1:
 ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
 ; GFX8-NEXT: s_mul_i32 s4, s4, 5
 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
 ; GFX8-NEXT: s_mov_b32 m0, -1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2174,9 +2174,9 @@
 ; GFX9-NEXT: s_cbranch_execz BB11_2
 ; GFX9-NEXT: ; %bb.1:
 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
 ; GFX9-NEXT: s_mul_i32 s4, s4, 5
 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2208,10 +2208,10 @@
 ; GFX1064-NEXT: s_cbranch_execz BB11_2
 ; GFX1064-NEXT: ; %bb.1:
 ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_mul_i32 s4, s4, 5
 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX1064-NEXT: s_mul_i32 s5, s4, 5
-; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
-; GFX1064-NEXT: v_mov_b32_e32 v1, s5
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2243,10 +2243,10 @@
 ; GFX1032-NEXT: s_cbranch_execz BB11_2
 ; GFX1032-NEXT: ; %bb.1:
 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_mul_i32 s3, s3, 5
 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX1032-NEXT: s_mul_i32 s4, s3, 5
-; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1202,15 +1202,14 @@
 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_hi_u32 v5, 24, v0
 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v6, 24, v3
-; GCN-NEXT: v_mul_hi_u32 v0, 0, v0
+; GCN-NEXT: v_mul_hi_u32 v0, 24, v0
+; GCN-NEXT: v_mul_hi_u32 v5, 24, v3
 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT: v_mul_lo_u32 v2, s8, v1
 ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0
@@ -1420,15 +1419,14 @@
 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v4, v6, s[4:5]
 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_hi_u32 v6, 24, v3
 ; GCN-NEXT: v_mul_lo_u32 v5, v4, 24
-; GCN-NEXT: v_mul_hi_u32 v7, 24, v4
-; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
+; GCN-NEXT: v_mul_hi_u32 v3, 24, v3
+; GCN-NEXT: v_mul_hi_u32 v6, 24, v4
 ; GCN-NEXT: v_mul_hi_u32 v4, 0, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v7, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 0, v5
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v13, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 0, v3
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v4
 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3
@@ -1633,15 +1631,14 @@
 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT: s_mov_b32 s4, 0x8000
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_mul_hi_u32 v5, s4, v3
-; GCN-NEXT: v_mul_hi_u32 v6, s4, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 15, v4
-; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT: v_mul_hi_u32 v3, s4, v3
+; GCN-NEXT: v_mul_hi_u32 v5, s4, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 15, v4
 ; GCN-NEXT: v_mul_hi_u32 v4, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 0, v5
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 0, v3
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v4
 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1390,15 +1390,14 @@
 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_hi_u32 v5, 24, v0
 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v6, 24, v3
-; GCN-NEXT: v_mul_hi_u32 v0, 0, v0
+; GCN-NEXT: v_mul_hi_u32 v0, 24, v0
+; GCN-NEXT: v_mul_hi_u32 v5, 24, v3
 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v0, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT: v_mul_lo_u32 v1, s8, v1
 ; GCN-NEXT: v_mul_hi_u32 v2, s8, v0
@@ -1605,15 +1604,14 @@
 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5]
 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_hi_u32 v5, 24, v2
 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v6, 24, v3
-; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
+; GCN-NEXT: v_mul_hi_u32 v2, 24, v2
+; GCN-NEXT: v_mul_hi_u32 v5, 24, v3
 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v3
 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
@@ -1816,15 +1814,14 @@
 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT: s_mov_b32 s4, 0x8000
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_hi_u32 v4, s4, v2
-; GCN-NEXT: v_mul_hi_u32 v5, s4, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 15, v3
-; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
-; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT: v_mul_hi_u32 v2, s4, v2
+; GCN-NEXT: v_mul_hi_u32 v4, s4, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 15, v3
 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0, v2
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v3
 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -969,14 +969,14 @@
 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6
 ; GCN-NEXT: v_mul_lo_u32 v7, v0, v4
 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v8, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT: v_mul_hi_u32 v8, v3, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4
@@ -999,27 +999,24 @@
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
+; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v3, v1, 24
 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
-; GCN-NEXT: v_mul_hi_u32 v3, v3, 24
+; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
 ; GCN-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v3, vcc
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT: v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v1, s7, v0
 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v0
-; GCN-NEXT: v_mul_lo_u32 v4, s7, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT: v_mul_lo_u32 v3, s6, v0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 24, v3
 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
 ; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s6, v3
@@ -1031,21 +1028,21 @@
 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v4
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0
-; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0
-; GCN-NEXT: v_subb_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v2
-; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -9,7 +9,7 @@
 ; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, 5
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -28,7 +28,7 @@
 ; CHECK-NEXT: s_mov_b32 s4, 0x92492493
 ; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: v_mul_lo_u32 v0, v0, 14
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 14, v0
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -46,7 +46,7 @@
 ; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, 5
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -62,10 +62,9 @@
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT: s_mov_b32 s4, 0x2050c9f9
-; CHECK-NEXT: s_movk_i32 s5, 0x1fb
 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 6, v1
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, s5
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x1fb, v1
 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -779,14 +779,14 @@
 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6
 ; GCN-NEXT: v_mul_lo_u32 v7, v0, v4
 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4
-; GCN-NEXT: v_mul_lo_u32 v8, v3, v6
-; GCN-NEXT: v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT: v_mul_hi_u32 v8, v3, v6
+; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc
 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4
-; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc
 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4
@@ -809,27 +809,24 @@
 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6
 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
+; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v3, 24
-; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
-; GCN-NEXT: v_mul_hi_u32 v3, v3, 24
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v3, vcc
 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT: v_mul_lo_u32 v1, s6, v1
+; GCN-NEXT: v_mul_lo_u32 v3, v1, 24
+; GCN-NEXT: v_mul_hi_u32 v0, v0, 24
+; GCN-NEXT: v_mul_hi_u32 v1, v1, 24
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc
+; GCN-NEXT: v_mul_lo_u32 v1, s7, v0
 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
-; GCN-NEXT: v_mul_lo_u32 v3, s7, v0
 ; GCN-NEXT: v_mul_lo_u32 v0, s6, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s7
 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GCN-NEXT: v_mov_b32_e32 v3, s7
 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s6, v0
diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -684,29 +684,16 @@
 ; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: .pad #4
 ; ARM-NEXT: sub sp, sp, #4
-; ARM-NEXT: ldr r0, .LCPI10_0
-; ARM-NEXT: mov r1, #33
-; ARM-NEXT: umull r2, r3, r1, r0
-; ARM-NEXT: lsr r0, r3, #3
-; ARM-NEXT: add r0, r0, r0, lsl #2
-; ARM-NEXT: sub r0, r1, r0, lsl #1
-; ARM-NEXT: ldr r1, [sp]
-; ARM-NEXT: and r1, r1, #-33554432
-; ARM-NEXT: orr r0, r1, r0
-; ARM-NEXT: mov r1, #255
+; ARM-NEXT: ldr r0, [sp]
+; ARM-NEXT: mov r1, #40960
+; ARM-NEXT: orr r1, r1, #-33554432
 ; ARM-NEXT: orr r0, r0, #40960
-; ARM-NEXT: orr r1, r1, #3840
-; ARM-NEXT: str r0, [sp]
 ; ARM-NEXT: and r0, r0, r1
-; ARM-NEXT: sub r0, r0, #3
-; ARM-NEXT: rsbs r1, r0, #0
-; ARM-NEXT: adc r0, r0, r1
+; ARM-NEXT: orr r0, r0, #3
+; ARM-NEXT: str r0, [sp]
+; ARM-NEXT: mov r0, #1
 ; ARM-NEXT: add sp, sp, #4
 ; ARM-NEXT: mov pc, lr
-; ARM-NEXT: .p2align 2
-; ARM-NEXT: @ %bb.1:
-; ARM-NEXT: .LCPI10_0:
-; ARM-NEXT: .long 3435973837 @ 0xcccccccd
 ;
 ; ARMT2-LABEL: t11:
 ; ARMT2: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll
@@ -88,9 +88,8 @@
 ; PPC64LE-NEXT: clrlwi 3, 3, 28
 ; PPC64LE-NEXT: ori 4, 4, 52429
 ; PPC64LE-NEXT: mulhwu 4, 3, 4
-; PPC64LE-NEXT: rlwinm 5, 4, 0, 0, 29
 ; PPC64LE-NEXT: srwi 4, 4, 2
-; PPC64LE-NEXT: add 4, 4, 5
+; PPC64LE-NEXT: rlwimi 4, 4, 2, 28, 29
 ; PPC64LE-NEXT: sub 3, 3, 4
 ; PPC64LE-NEXT: cntlzw 3, 3
 ; PPC64LE-NEXT: not 3, 3
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -34,20 +34,20 @@
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl
 ; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: je LBB0_25
 ; CHECK-NEXT: ## %bb.1: ## %bb116.i
-; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: je LBB0_25
 ; CHECK-NEXT: ## %bb.2: ## %bb52.i.i
 ; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: je LBB0_25
 ; CHECK-NEXT: ## %bb.3: ## %bb142.i
-; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: je LBB0_25
 ; CHECK-NEXT: ## %bb.4:
 ; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi
 ; CHECK-NEXT: movb $1, %bh
-; CHECK-NEXT: movl $274877907, %ebp ## imm = 0x10624DD3
+; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp
 ; CHECK-NEXT: jmp LBB0_5
-; CHECK-NEXT: LBB0_23: ## %bb7806
+; CHECK-NEXT: LBB0_21: ## %bb7806
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: Ltmp16:
 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -58,7 +58,7 @@
 ; CHECK-NEXT: LBB0_5: ## %bb3261
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: cmpl $37, 0
-; CHECK-NEXT: jne LBB0_27
+; CHECK-NEXT: jne LBB0_25
 ; CHECK-NEXT: ## %bb.6: ## %bb3306
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: Ltmp0:
@@ -70,7 +70,7 @@
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: movl 0, %eax
 ; CHECK-NEXT: cmpl $121, %eax
-; CHECK-NEXT: ja LBB0_27
+; CHECK-NEXT: ja LBB0_25
 ; CHECK-NEXT: ## %bb.8: ## %bb3314
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4)
@@ -78,11 +78,11 @@
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne LBB0_27
+; CHECK-NEXT: jne LBB0_25
 ; CHECK-NEXT: ## %bb.11: ## %bb5809
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: testb %bh, %bh
-; CHECK-NEXT: je LBB0_27
+; CHECK-NEXT: je LBB0_25
 ; CHECK-NEXT: ## %bb.12: ## %bb91.i8504
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: testb $1, %bl
@@ -98,10 +98,10 @@
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: testb $1, %bl
 ; CHECK-NEXT: je LBB0_15
-; CHECK-NEXT: ## %bb.17: ## %bb278.i8617
+; CHECK-NEXT: ## %bb.16: ## %bb278.i8617
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: je LBB0_19
-; CHECK-NEXT: ## %bb.18: ## %bb440.i8663
+; CHECK-NEXT: je LBB0_18
+; CHECK-NEXT: ## %bb.17: ## %bb440.i8663
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: Ltmp6:
 ; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax
@@ -110,39 +110,24 @@
 ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT: movl L__ZZNK10wxDateTime5GetTmERKNS_8TimeZoneEE12__FUNCTION__$non_lazy_ptr, %eax
 ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl L_.str$non_lazy_ptr, %eax
-; CHECK-NEXT: movl %eax, (%esp)
+; CHECK-NEXT: movl %ebp, (%esp)
 ; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5
 ; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_
 ; CHECK-NEXT: Ltmp7:
-; CHECK-NEXT: LBB0_19: ## %bb448.i8694
-; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: jmp LBB0_20
+; CHECK-NEXT: jmp LBB0_18
 ; CHECK-NEXT: LBB0_15: ## %bb187.i8591
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: jne LBB0_27
-; CHECK-NEXT: ## %bb.16: ## %bb265.i8606
+; CHECK-NEXT: jne LBB0_25
+; CHECK-NEXT: LBB0_18: ## %invcont5814
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: imull %ebp
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shrl $31, %eax
-; CHECK-NEXT: shrl $6, %edx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: imull $1000, %edx, %eax ## imm = 0x3E8
-; CHECK-NEXT: negl %eax
-; CHECK-NEXT: LBB0_20: ## %invcont5814
-; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: Ltmp8:
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT: movl $0, (%esp)
 ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT: subl $4, %esp
 ; CHECK-NEXT: Ltmp9:
-; CHECK-NEXT: ## %bb.21: ## %invcont5831
+; CHECK-NEXT: ## %bb.19: ## %invcont5831
 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT: Ltmp10:
 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -160,8 +145,8 @@
 ; CHECK-NEXT: movl %eax, (%esp)
 ; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
 ; CHECK-NEXT: Ltmp14:
-; CHECK-NEXT: jmp LBB0_27
-; CHECK-NEXT: LBB0_22: ## %bb5968
+; CHECK-NEXT: jmp LBB0_25
+; CHECK-NEXT: LBB0_20: ## %bb5968
 ; CHECK-NEXT: Ltmp2:
 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -169,7 +154,7 @@
 ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT: subl $4, %esp
 ; CHECK-NEXT: Ltmp3:
-; CHECK-NEXT: LBB0_27: ## %bb115.critedge.i
+; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i
 ; CHECK-NEXT: movl %esi, %eax
 ; CHECK-NEXT: addl $28, %esp
 ; CHECK-NEXT: popl %esi
@@ -177,15 +162,15 @@
 ; CHECK-NEXT: popl %edi
 ; CHECK-NEXT: popl %ebx
 ; CHECK-NEXT: popl %ebp
 ; CHECK-NEXT: retl $4
-; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp
+; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp
 ; CHECK-NEXT: Ltmp15:
-; CHECK-NEXT: jmp LBB0_27
-; CHECK-NEXT: LBB0_26: ## %lpad8185
+; CHECK-NEXT: jmp LBB0_25
+; CHECK-NEXT: LBB0_24: ## %lpad8185
 ; CHECK-NEXT: Ltmp12:
-; CHECK-NEXT: jmp LBB0_27
-; CHECK-NEXT: LBB0_24: ## %lpad.loopexit
+; CHECK-NEXT: jmp LBB0_25
+; CHECK-NEXT: LBB0_22: ## %lpad.loopexit
 ; CHECK-NEXT: Ltmp18:
-; CHECK-NEXT: jmp LBB0_27
+; CHECK-NEXT: jmp LBB0_25
 ; CHECK-NEXT: Lfunc_end0:
 entry:
   br i1 %foo, label %bb116.i, label %bb115.critedge.i
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -693,23 +693,20 @@
 define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-LABEL: pr38477:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115]
-; SSE2-NEXT: pmulhuw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psubw %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psubw %xmm1, %xmm0
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: pr38477:
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -91,13 +91,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: psubd %xmm2, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
@@ -113,7 +107,7 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2
 ; SSE4-NEXT: psubd %xmm2, %xmm0
 ; SSE4-NEXT: pxor %xmm1, %xmm1
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
@@ -130,8 +124,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -156,19 +149,12 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [3,5,6,9]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psrld $2, %xmm1
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
+; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
+; SSE2-NEXT: psubd %xmm2, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
 ; SSE2-NEXT: retq
@@ -187,7 +173,7 @@
 ; SSE4-NEXT: psrld $2, %xmm2
 ; SSE4-NEXT: psrld $1, %xmm1
 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7]
-; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm1
 ; SSE4-NEXT: psubd %xmm1, %xmm0
 ; SSE4-NEXT: pxor %xmm1, %xmm1
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
@@ -204,7 +190,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -292,13 +278,7 @@
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
 ; SSE2-NEXT: psubd %xmm2, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
@@ -314,7 +294,7 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2
 ; SSE4-NEXT: psubd %xmm2, %xmm0
 ; SSE4-NEXT: pxor %xmm1, %xmm1
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
@@ -331,8 +311,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
-; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -113,6 +113,8 @@
       KnownBits KnownSMax(KnownAnd);
       KnownBits KnownSMin(KnownAnd);
       KnownBits KnownMul(KnownAnd);
+      KnownBits KnownMulHS(KnownAnd);
+      KnownBits KnownMulHU(KnownAnd);
       KnownBits KnownUDiv(KnownAnd);
      KnownBits KnownURem(KnownAnd);
      KnownBits KnownSRem(KnownAnd);
@@ -156,6 +158,14 @@
          KnownMul.One &= Res;
          KnownMul.Zero &= ~Res;
 
+          Res = (N1.sext(2 * Bits) * N2.sext(2 * Bits)).extractBits(Bits, Bits);
+          KnownMulHS.One &= Res;
+          KnownMulHS.Zero &= ~Res;
+
+          Res = (N1.zext(2 * Bits) * N2.zext(2 * Bits)).extractBits(Bits, Bits);
+          KnownMulHU.One &= Res;
+          KnownMulHU.Zero &= ~Res;
+
          if (!N2.isNullValue()) {
            Res = N1.udiv(N2);
            KnownUDiv.One &= Res;
@@ -218,12 +228,20 @@
      EXPECT_EQ(KnownSMin.Zero, ComputedSMin.Zero);
      EXPECT_EQ(KnownSMin.One, ComputedSMin.One);
 
-      // ComputedMul is conservatively correct, but not guaranteed to be
+      // The following are conservatively correct, but not guaranteed to be
      // precise.
      KnownBits ComputedMul = KnownBits::computeForMul(Known1, Known2);
      EXPECT_TRUE(ComputedMul.Zero.isSubsetOf(KnownMul.Zero));
      EXPECT_TRUE(ComputedMul.One.isSubsetOf(KnownMul.One));
 
+      KnownBits ComputedMulHS = KnownBits::mulhs(Known1, Known2);
+      EXPECT_TRUE(ComputedMulHS.Zero.isSubsetOf(KnownMulHS.Zero));
+      EXPECT_TRUE(ComputedMulHS.One.isSubsetOf(KnownMulHS.One));
+
+      KnownBits ComputedMulHU = KnownBits::mulhu(Known1, Known2);
+      EXPECT_TRUE(ComputedMulHU.Zero.isSubsetOf(KnownMulHU.Zero));
+      EXPECT_TRUE(ComputedMulHU.One.isSubsetOf(KnownMulHU.One));
+
      KnownBits ComputedUDiv = KnownBits::udiv(Known1, Known2);
      EXPECT_TRUE(ComputedUDiv.Zero.isSubsetOf(KnownUDiv.Zero));
      EXPECT_TRUE(ComputedUDiv.One.isSubsetOf(KnownUDiv.One));
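
For anyone who wants to poke at the new entry points outside the gtest harness, here is a small standalone sketch (not part of the patch; the file name and build line are assumptions). It mirrors the exhaustive KnownBitsTest.cpp check above at a fixed 4-bit width: every concrete value consistent with a partially-known LHS is enumerated, and KnownBits::mulhu may only claim bits that hold for all of them.

```cpp
// mulhu_demo.cpp -- illustrative only; build against LLVMSupport, e.g.
//   clang++ mulhu_demo.cpp $(llvm-config --cxxflags --ldflags --libs support)
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

using namespace llvm;

int main() {
  const unsigned Bits = 4;

  // LHS: bit 1 known one, bit 3 known zero -> one of {2, 3, 6, 7}.
  KnownBits LHS(Bits);
  LHS.One = APInt(Bits, 0x2);
  LHS.Zero = APInt(Bits, 0x8);

  // RHS: the constant 5 (every bit known), so RHS.One is its exact value.
  KnownBits RHS(Bits);
  RHS.One = APInt(Bits, 0x5);
  RHS.Zero = ~RHS.One;

  // What the new analysis claims about the high half of the product.
  KnownBits Computed = KnownBits::mulhu(LHS, RHS);

  // Brute-force the bits shared by every concrete
  // trunc(lshr(mul(zext(x), zext(y)), Bits)) consistent with LHS/RHS,
  // the same way the unit test accumulates KnownMulHU.
  APInt AlwaysOne = APInt::getAllOnesValue(Bits);
  APInt AlwaysZero = APInt::getAllOnesValue(Bits);
  for (unsigned X = 0; X < (1u << Bits); ++X) {
    APInt N1(Bits, X);
    if ((N1 & LHS.Zero) != 0 || (~N1 & LHS.One) != 0)
      continue; // X contradicts a known bit of LHS.
    APInt Res =
        (N1.zext(2 * Bits) * RHS.One.zext(2 * Bits)).extractBits(Bits, Bits);
    AlwaysOne &= Res;
    AlwaysZero &= ~Res;
  }

  // mulhu is conservative: it may know less than the brute force, never more.
  assert(Computed.One.isSubsetOf(AlwaysOne));
  assert(Computed.Zero.isSubsetOf(AlwaysZero));
  outs() << "mulhu known bits are sound\n";
  return 0;
}
```

Swapping zext for sext in the loop gives the corresponding check for mulhs, matching the trunc(srl(mul(sext(x),sext(y)),bw)) identity noted in the SelectionDAG comments.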