diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1200,17 +1200,27 @@
                              Depth + 1))
       return true;
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-    if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts,
-                             Known2, TLO, Depth + 1))
+    APInt Op0DemandedBits = ~Known.Zero & DemandedBits;
+    if (SimplifyDemandedBits(Op0, Op0DemandedBits, DemandedElts, Known2, TLO,
+                             Depth + 1))
       return true;
     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
 
+    // If we have learned that some more bits of Op1 are not demanded due to
+    // known bits in Op0, try simplifying Op1 again.
+    APInt Op1DemandedBits = ~Known2.Zero & DemandedBits;
+    if (Op1DemandedBits != DemandedBits &&
+        SimplifyDemandedBits(Op1, Op1DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
     // Attempt to avoid multi-use ops if we don't need anything from them.
     if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
-          Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+          Op0, Op0DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
-          Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+          Op1, Op1DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
       if (DemandedOp0 || DemandedOp1) {
         Op0 = DemandedOp0 ? DemandedOp0 : Op0;
         Op1 = DemandedOp1 ? DemandedOp1 : Op1;
@@ -1252,6 +1262,15 @@
       return true;
     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
 
+    // If we have learned that some more bits of Op1 are not demanded due to
+    // known bits in Op0, try simplifying Op1 again.
+    APInt Op1DemandedBits = ~Known2.One & DemandedBits;
+    if (Op1DemandedBits != DemandedBits &&
+        SimplifyDemandedBits(Op1, Op1DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
     // Attempt to avoid multi-use ops if we don't need anything from them.
     if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -965,34 +965,28 @@
 ; GFX7-NEXT:    buffer_load_ushort v8, off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_i32 v1, v2, 8, 8
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
+; GFX7-NEXT:    v_bfe_i32 v1, v2, 16, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
 ; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v6
-; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
-; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 24, v2
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v5, v0, 16, 8
+; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 24, v0
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v8
-; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
-; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
 ; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v6, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v5, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1862,30 +1862,24 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    buffer_load_ushort v8, off, s[0:3], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00
 ; GFX7-NEXT:    s_movk_i32 s5, 0xff
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
-; GFX7-NEXT:    v_and_b32_e32 v3, s5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
+; GFX7-NEXT:    v_and_b32_e32 v5, s5, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
-; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s5, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
-; GFX7-NEXT:    v_and_b32_e32 v3, s5, v3
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v8, s5, v0
+; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 16
+; GFX7-NEXT:    v_bfe_u32 v6, v6, 8, 16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v8
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
 ; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
 ; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
-; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2204,66 +2204,54 @@
 ; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v1, v3, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v4, v3, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v5, v3, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v6, v3, 0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v11
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
-; GFX7-NEXT:    v_and_b32_e32 v11, v2, v13
-; GFX7-NEXT:    v_bfe_i32 v7, v3, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v3, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v1, v3, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v4, v3, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v5, v3, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v6, v3, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v7, v3, 4, 4
+; GFX7-NEXT:    v_bfe_i32 v8, v3, 0, 4
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v3
 ; GFX7-NEXT:    v_bfe_i32 v3, v3, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v13, v0, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 4, 4
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 0, 4
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
-; GFX7-NEXT:    v_and_b32_e32 v12, v2, v14
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v14, v2, v16
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
 ; GFX7-NEXT:    v_and_b32_e32 v3, v2, v3
 ; GFX7-NEXT:    v_and_b32_e32 v9, v2, v9
+; GFX7-NEXT:    v_and_b32_e32 v13, v2, v13
+; GFX7-NEXT:    v_and_b32_e32 v14, v2, v14
+; GFX7-NEXT:    v_and_b32_e32 v15, v2, v15
 ; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_and_b32_e32 v6, v2, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v5
-; GFX7-NEXT:    buffer_load_ushort v5, off, s[0:3], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, v2, v16
+; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
 ; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
+; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v4, v4, v6, v5
-; GFX7-NEXT:    v_mad_u32_u24 v4, v16, v11, v4
-; GFX7-NEXT:    v_mad_u32_u24 v4, v8, v13, v4
-; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v4
-; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v8, v8, v15, v16
+; GFX7-NEXT:    v_mad_u32_u24 v7, v7, v14, v8
+; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v13, v7
+; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v6
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v10, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v2, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
@@ -2813,95 +2801,91 @@
 ; GFX7-NEXT:    s_mov_b32 s11, s3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT:    s_movk_i32 s4, 0xff
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xff
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX7-NEXT:    s_mov_b32 s5, 0xffff
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
+; GFX7-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-NEXT:    v_bfe_i32 v6, v2, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v8, v2, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v9, v2, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v10, v2, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v11, v2, 4, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 28, v2
+; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v5
+; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; GFX7-NEXT:    v_and_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v6, v4, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v7, v4, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v8, v4, 12, 4
-; GFX7-NEXT:    v_bfe_i32 v9, v4, 8, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v13, v0, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
-; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v13
-; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 8, v16
-; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
-; GFX7-NEXT:    v_bfe_i32 v5, v4, 24, 4
-; GFX7-NEXT:    v_bfe_i32 v10, v4, 4, 4
-; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v4
-; GFX7-NEXT:    v_bfe_i32 v4, v4, 0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
-; GFX7-NEXT:    v_ashrrev_i32_e32 v12, 28, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; GFX7-NEXT:    v_and_b32_e32 v4, v2, v4
-; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
-; GFX7-NEXT:    v_bfe_i32 v15, v0, 16, 4
-; GFX7-NEXT:    v_bfe_i32 v17, v0, 8, 4
-; GFX7-NEXT:    v_bfe_i32 v18, v0, 4, 4
+; GFX7-NEXT:    v_ashrrev_i32_e32 v13, 28, v0
+; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
+; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
+; GFX7-NEXT:    v_bfe_i32 v16, v0, 16, 4
+; GFX7-NEXT:    v_bfe_i32 v17, v0, 12, 4
+; GFX7-NEXT:    v_bfe_i32 v18, v0, 8, 4
+; GFX7-NEXT:    v_bfe_i32 v19, v0, 4, 4
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v11
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v12
-; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
-; GFX7-NEXT:    v_and_b32_e32 v11, v2, v15
-; GFX7-NEXT:    v_and_b32_e32 v14, v2, v17
-; GFX7-NEXT:    v_lshlrev_b32_e32 v15, 8, v18
-; GFX7-NEXT:    v_and_b32_e32 v0, v2, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v15
-; GFX7-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX7-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT:    v_and_b32_e32 v6, s5, v6
-; GFX7-NEXT:    v_or_b32_e32 v10, v14, v13
-; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX7-NEXT:    v_or_b32_e32 v6, v6, v12
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v9
+; GFX7-NEXT:    v_or_b32_e32 v10, v2, v11
+; GFX7-NEXT:    v_or_b32_e32 v7, v8, v7
+; GFX7-NEXT:    v_and_b32_e32 v12, v3, v14
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 8, v13
+; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v15
+; GFX7-NEXT:    v_and_b32_e32 v15, v3, v16
+; GFX7-NEXT:    v_lshlrev_b32_e32 v16, 8, v17
+; GFX7-NEXT:    v_and_b32_e32 v17, v3, v18
+; GFX7-NEXT:    v_lshlrev_b32_e32 v18, 8, v19
 ; GFX7-NEXT:    v_and_b32_e32 v0, v3, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX7-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, v3, v9
-; GFX7-NEXT:    v_or_b32_e32 v3, v7, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, v2, v4
-; GFX7-NEXT:    v_and_b32_e32 v13, v2, v0
-; GFX7-NEXT:    v_bfe_u32 v8, v4, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
-; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_and_b32_e32 v12, v2, v12
-; GFX7-NEXT:    v_and_b32_e32 v9, v2, v5
-; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
-; GFX7-NEXT:    v_bfe_u32 v10, v5, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v15, v3, 8, 8
-; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
-; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
+; GFX7-NEXT:    v_and_b32_e32 v5, v3, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, v3, v13
+; GFX7-NEXT:    v_or_b32_e32 v11, v12, v11
+; GFX7-NEXT:    v_or_b32_e32 v12, v15, v14
+; GFX7-NEXT:    v_or_b32_e32 v14, v0, v18
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v7, s5, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, s5, v10
+; GFX7-NEXT:    v_or_b32_e32 v13, v17, v16
+; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX7-NEXT:    v_or_b32_e32 v7, v10, v9
+; GFX7-NEXT:    v_and_b32_e32 v10, v4, v12
+; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
+; GFX7-NEXT:    v_and_b32_e32 v4, v4, v14
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v11
+; GFX7-NEXT:    v_bfe_u32 v11, v7, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v14, v4, 8, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v7, v7, v13, v16
-; GFX7-NEXT:    v_mad_u32_u24 v7, v8, v14, v7
-; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v0, v7
-; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v11, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v2, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v9, v10, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v7
+; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
+; GFX7-NEXT:    v_bfe_u32 v7, v7, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v4, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v13, v0
+; GFX7-NEXT:    v_bfe_u32 v12, v6, 8, 8
+; GFX7-NEXT:    v_bfe_u32 v16, v9, 8, 8
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT:    v_bfe_u32 v6, v6, 16, 8
+; GFX7-NEXT:    v_bfe_u32 v9, v9, 16, 8
+; GFX7-NEXT:    v_mad_u32_u24 v0, v12, v16, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v9, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v3, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v12, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2108,10 +2108,10 @@
 ; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
 ; GFX7-NEXT:    s_add_u32 s12, s12, s3
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
@@ -2119,45 +2119,32 @@
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
-; GFX7-NEXT:    s_mov_b32 s4, 0xf0000
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 12, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
-; GFX7-NEXT:    v_bfe_u32 v4, v2, 12, 4
-; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v6, 15, v2
-; GFX7-NEXT:    v_alignbit_b32 v2, v7, v2, 16
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
+; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
+; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 12, v0
-; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
-; GFX7-NEXT:    v_and_b32_e32 v13, 15, v0
-; GFX7-NEXT:    v_or_b32_e32 v7, v13, v7
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
-; GFX7-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v7, v16
-; GFX7-NEXT:    v_bfe_u32 v12, v0, 8, 4
-; GFX7-NEXT:    v_mad_u32_u24 v6, v8, v13, v6
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
 ; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
-; GFX7-NEXT:    v_bfe_u32 v11, v0, 12, 4
-; GFX7-NEXT:    v_alignbit_b32 v0, v14, v0, 16
-; GFX7-NEXT:    v_mad_u32_u24 v5, v5, v12, v6
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
+; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
+; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
+; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v4, v4, v11, v5
-; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v4
-; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v14, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
+; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -2479,76 +2466,72 @@
 ; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
+; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf00
 ; GFX7-NEXT:    v_mov_b32_e32 v3, 0xf00
 ; GFX7-NEXT:    s_movk_i32 s5, 0xf0f
 ; GFX7-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX7-NEXT:    s_waitcnt vmcnt(2)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
-; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
-; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
-; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 28, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 4, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 12, v2
+; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX7-NEXT:    v_bfe_u32 v8, v2, 16, 4
+; GFX7-NEXT:    v_alignbit_b32 v2, v7, v2, 24
+; GFX7-NEXT:    v_and_b32_e32 v7, s4, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
-; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX7-NEXT:    v_and_b32_e32 v6, v3, v9
-; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
-; GFX7-NEXT:    v_and_b32_e32 v3, v3, v11
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
-; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
-; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 4, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 4, v0
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
+; GFX7-NEXT:    v_and_b32_e32 v10, v3, v10
+; GFX7-NEXT:    v_bfe_u32 v11, v0, 8, 4
+; GFX7-NEXT:    v_and_b32_e32 v3, v3, v12
+; GFX7-NEXT:    v_and_b32_e32 v13, 15, v0
+; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 28, v0
+; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
 ; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
-; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
-; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
-; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
-; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
-; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
+; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 12, v0
+; GFX7-NEXT:    v_bfe_u32 v15, v0, 16, 4
+; GFX7-NEXT:    v_alignbit_b32 v0, v14, v0, 24
+; GFX7-NEXT:    v_or_b32_e32 v7, v6, v7
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_or_b32_e32 v10, v13, v10
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, s4, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
-; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX7-NEXT:    v_or_b32_e32 v9, v8, v9
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_and_b32_e32 v6, 15, v1
-; GFX7-NEXT:    v_and_b32_e32 v12, 15, v3
-; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
+; GFX7-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
+; GFX7-NEXT:    v_or_b32_e32 v5, v15, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v9, v4, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v14, v3, 8, 4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v12, v16
-; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
-; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 4
+; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
+; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 4
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 4
-; GFX7-NEXT:    v_mad_u32_u24 v6, v7, v13, v6
-; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v6
-; GFX7-NEXT:    v_and_b32_e32 v8, 15, v2
-; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
-; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v10, v1
-; GFX7-NEXT:    v_bfe_u32 v9, v2, 8, 4
-; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
-; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v11, v1
+; GFX7-NEXT:    v_bfe_u32 v10, v2, 8, 4
+; GFX7-NEXT:    v_bfe_u32 v16, v0, 8, 4
+; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 4
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 4
-; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v15, v1
+; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v16, v1
 ; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v11, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
 ; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll
--- a/llvm/test/CodeGen/RISCV/srem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll
@@ -51,8 +51,9 @@
 ; RV64IM-NEXT:    addiw a2, a2, 389
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 32
+; RV64IM-NEXT:    add a2, a1, a0
+; RV64IM-NEXT:    srliw a2, a2, 31
 ; RV64IM-NEXT:    addw a1, a1, a0
-; RV64IM-NEXT:    srliw a2, a1, 31
 ; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    add a1, a1, a2
 ; RV64IM-NEXT:    addi a2, zero, 95
@@ -292,9 +293,8 @@
 ; RV64IM-NEXT:    addiw a2, a2, 389
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 32
-; RV64IM-NEXT:    addw a2, a1, a0
-; RV64IM-NEXT:    srliw a2, a2, 31
 ; RV64IM-NEXT:    add a1, a1, a0
+; RV64IM-NEXT:    srliw a2, a1, 31
 ; RV64IM-NEXT:    sraiw a1, a1, 6
 ; RV64IM-NEXT:    add a1, a1, a2
 ; RV64IM-NEXT:    addi a2, zero, 95
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -331,32 +331,32 @@
 ; RV32-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a0
 ; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    lbu a1, 12(s0)
-; RV32-NEXT:    lw a2, 8(s0)
-; RV32-NEXT:    andi a3, a0, 1
-; RV32-NEXT:    neg s2, a3
-; RV32-NEXT:    slli a3, a1, 30
-; RV32-NEXT:    srli a4, a2, 2
-; RV32-NEXT:    or s3, a4, a3
-; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    andi a1, a1, 1
-; RV32-NEXT:    neg s1, a1
-; RV32-NEXT:    slli a1, a2, 31
+; RV32-NEXT:    lw a1, 8(s0)
+; RV32-NEXT:    andi a2, a0, 1
+; RV32-NEXT:    neg s2, a2
+; RV32-NEXT:    slli a2, a1, 31
 ; RV32-NEXT:    srli a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    or s3, a0, a2
+; RV32-NEXT:    lbu a2, 12(s0)
+; RV32-NEXT:    srli a0, a1, 1
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    neg s1, a0
+; RV32-NEXT:    slli a0, a2, 30
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    lw s4, 0(s0)
-; RV32-NEXT:    srli a1, a2, 1
+; RV32-NEXT:    srli a1, a2, 2
 ; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    neg a1, a1
-; RV32-NEXT:    addi a2, zero, 7
-; RV32-NEXT:    mv a3, zero
+; RV32-NEXT:    addi a2, zero, -5
+; RV32-NEXT:    addi a3, zero, -1
 ; RV32-NEXT:    call __moddi3@plt
 ; RV32-NEXT:    mv s5, a0
 ; RV32-NEXT:    mv s6, a1
-; RV32-NEXT:    addi a2, zero, -5
-; RV32-NEXT:    addi a3, zero, -1
+; RV32-NEXT:    addi a2, zero, 7
 ; RV32-NEXT:    mv a0, s3
 ; RV32-NEXT:    mv a1, s1
+; RV32-NEXT:    mv a3, zero
 ; RV32-NEXT:    call __moddi3@plt
 ; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:    mv s3, a1
@@ -365,10 +365,10 @@
 ; RV32-NEXT:    mv a1, s2
 ; RV32-NEXT:    mv a3, zero
 ; RV32-NEXT:    call __moddi3@plt
-; RV32-NEXT:    xori a2, s1, 2
+; RV32-NEXT:    xori a2, s1, 1
 ; RV32-NEXT:    or a2, a2, s3
 ; RV32-NEXT:    snez a2, a2
-; RV32-NEXT:    xori a3, s5, 1
+; RV32-NEXT:    xori a3, s5, 2
 ; RV32-NEXT:    or a3, a3, s6
 ; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    or a0, a0, a1
@@ -377,20 +377,18 @@
 ; RV32-NEXT:    neg a4, a2
 ; RV32-NEXT:    neg a5, a0
 ; RV32-NEXT:    sw a5, 0(s0)
-; RV32-NEXT:    slli a3, a3, 1
-; RV32-NEXT:    sub a0, a0, a3
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    sub a0, a0, a2
 ; RV32-NEXT:    sw a0, 4(s0)
-; RV32-NEXT:    slli a0, a2, 2
-; RV32-NEXT:    srli a2, a4, 30
-; RV32-NEXT:    sub a2, a2, a0
-; RV32-NEXT:    andi a2, a2, 7
-; RV32-NEXT:    sb a2, 12(s0)
-; RV32-NEXT:    srli a2, a1, 31
-; RV32-NEXT:    andi a1, a1, 1
-; RV32-NEXT:    slli a1, a1, 1
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    srli a0, a4, 31
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    slli a2, a3, 2
+; RV32-NEXT:    sub a0, a0, a2
 ; RV32-NEXT:    sw a0, 8(s0)
+; RV32-NEXT:    srli a0, a1, 30
+; RV32-NEXT:    sub a0, a0, a2
+; RV32-NEXT:    andi a0, a0, 7
+; RV32-NEXT:    sb a0, 12(s0)
 ; RV32-NEXT:    lw s6, 0(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
@@ -488,32 +486,32 @@
 ; RV32M-NEXT:    sw s6, 0(sp) # 4-byte Folded Spill
 ; RV32M-NEXT:    mv s0, a0
 ; RV32M-NEXT:    lw a0, 4(a0)
-; RV32M-NEXT:    lbu a1, 12(s0)
-; RV32M-NEXT:    lw a2, 8(s0)
-; RV32M-NEXT:    andi a3, a0, 1
-; RV32M-NEXT:    neg s2, a3
-; RV32M-NEXT:    slli a3, a1, 30
-; RV32M-NEXT:    srli a4, a2, 2
-; RV32M-NEXT:    or s3, a4, a3
-; RV32M-NEXT:    srli a1, a1, 2
-; RV32M-NEXT:    andi a1, a1, 1
-; RV32M-NEXT:    neg s1, a1
-; RV32M-NEXT:    slli a1, a2, 31
+; RV32M-NEXT:    lw a1, 8(s0)
+; RV32M-NEXT:    andi a2, a0, 1
+; RV32M-NEXT:    neg s2, a2
+; RV32M-NEXT:    slli a2, a1, 31
 ; RV32M-NEXT:    srli a0, a0, 1
-; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    or s3, a0, a2
+; RV32M-NEXT:    lbu a2, 12(s0)
+; RV32M-NEXT:    srli a0, a1, 1
+; RV32M-NEXT:    andi a0, a0, 1
+; RV32M-NEXT:    neg s1, a0
+; RV32M-NEXT:    slli a0, a2, 30
+; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    or a0, a1, a0
 ; RV32M-NEXT:    lw s4, 0(s0)
-; RV32M-NEXT:    srli a1, a2, 1
+; RV32M-NEXT:    srli a1, a2, 2
 ; RV32M-NEXT:    andi a1, a1, 1
 ; RV32M-NEXT:    neg a1, a1
-; RV32M-NEXT:    addi a2, zero, 7
-; RV32M-NEXT:    mv a3, zero
+; RV32M-NEXT:    addi a2, zero, -5
+; RV32M-NEXT:    addi a3, zero, -1
 ; RV32M-NEXT:    call __moddi3@plt
 ; RV32M-NEXT:    mv s5, a0
 ; RV32M-NEXT:    mv s6, a1
-; RV32M-NEXT:    addi a2, zero, -5
-; RV32M-NEXT:    addi a3, zero, -1
+; RV32M-NEXT:    addi a2, zero, 7
 ; RV32M-NEXT:    mv a0, s3
 ; RV32M-NEXT:    mv a1, s1
+; RV32M-NEXT:    mv a3, zero
 ; RV32M-NEXT:    call __moddi3@plt
 ; RV32M-NEXT:    mv s1, a0
 ; RV32M-NEXT:    mv s3, a1
@@ -522,10 +520,10 @@
 ; RV32M-NEXT:    mv a1, s2
 ; RV32M-NEXT:    mv a3, zero
 ; RV32M-NEXT:    call __moddi3@plt
-; RV32M-NEXT:    xori a2, s1, 2
+; RV32M-NEXT:    xori a2, s1, 1
 ; RV32M-NEXT:    or a2, a2, s3
 ; RV32M-NEXT:    snez a2, a2
-; RV32M-NEXT:    xori a3, s5, 1
+; RV32M-NEXT:    xori a3, s5, 2
 ; RV32M-NEXT:    or a3, a3, s6
 ; RV32M-NEXT:    snez a3, a3
 ; RV32M-NEXT:    or a0, a0, a1
@@ -534,20 +532,18 @@
 ; RV32M-NEXT:    neg a4, a2
 ; RV32M-NEXT:    neg a5, a0
 ; RV32M-NEXT:    sw a5, 0(s0)
-; RV32M-NEXT:    slli a3, a3, 1
-; RV32M-NEXT:    sub a0, a0, a3
+; RV32M-NEXT:    slli a2, a2, 1
+; RV32M-NEXT:    sub a0, a0, a2
 ; RV32M-NEXT:    sw a0, 4(s0)
-; RV32M-NEXT:    slli a0, a2, 2
-; RV32M-NEXT:    srli a2, a4, 30
-; RV32M-NEXT:    sub a2, a2, a0
-; RV32M-NEXT:    andi a2, a2, 7
-; RV32M-NEXT:    sb a2, 12(s0)
-; RV32M-NEXT:    srli a2, a1, 31
-; RV32M-NEXT:    andi a1, a1, 1
-; RV32M-NEXT:    slli a1, a1, 1
-; RV32M-NEXT:    or a1, a2, a1
-; RV32M-NEXT:    sub a0, a1, a0
+; RV32M-NEXT:    srli a0, a4, 31
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    slli a2, a3, 2
+; RV32M-NEXT:    sub a0, a0, a2
 ; RV32M-NEXT:    sw a0, 8(s0)
+; RV32M-NEXT:    srli a0, a1, 30
+; RV32M-NEXT:    sub a0, a0, a2
+; RV32M-NEXT:    andi a0, a0, 7
+; RV32M-NEXT:    sb a0, 12(s0)
 ; RV32M-NEXT:    lw s6, 0(sp) # 4-byte Folded Reload
 ; RV32M-NEXT:    lw s5, 4(sp) # 4-byte Folded Reload
 ; RV32M-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -58,7 +58,6 @@
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -209,7 +208,6 @@
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
@@ -226,7 +224,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
@@ -253,7 +250,6 @@
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
@@ -272,7 +268,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
 ; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
@@ -459,7 +454,6 @@
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
@@ -804,7 +798,6 @@
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.u8 r2, q0[0]
 ; CHECK-NEXT:    vmov q7[2], q7[0], r2, r1
@@ -821,7 +814,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r0
@@ -848,7 +840,6 @@
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r0, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.u8 r3, q0[4]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r0
@@ -867,7 +858,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r1, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r1, r3
 ; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r1
@@ -912,7 +902,6 @@
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.u8 r3, q0[8]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
@@ -931,7 +920,6 @@
 ; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    adc.w r1, r1, r12
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
 ; CHECK-NEXT:    vmov.u8 r3, q0[10]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
@@ -958,7 +946,6 @@
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
 ; CHECK-NEXT:    vmov.u8 r0, q0[13]
 ; CHECK-NEXT:    vmov.u8 r3, q0[12]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
@@ -977,7 +964,6 @@
 ; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    adc.w r1, r1, r12
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[15]
 ; CHECK-NEXT:    vmov.u8 r3, q0[14]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
@@ -1279,7 +1265,6 @@
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[1]
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
@@ -1296,7 +1281,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r0
@@ -1323,7 +1307,6 @@
 ; CHECK-NEXT:    rsbs r0, r0, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r0, r3
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.u16 r3, q0[4]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r0
@@ -1342,7 +1325,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r1, r1, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r3
 ; CHECK-NEXT:    vmov.u16 r1, q0[7]
 ; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
@@ -1541,7 +1523,6 @@
 ; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    csetm r1, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vmov r1, s0
@@ -1693,7 +1674,6 @@
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
@@ -1859,7 +1839,6 @@
 ; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r12
 ; CHECK-NEXT:    vmov.u16 r12, q0[1]
 ; CHECK-NEXT:    vmov.u16 r3, q0[0]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r12
@@ -1876,7 +1855,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
@@ -1903,7 +1881,6 @@
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r4, r3
 ; CHECK-NEXT:    vmov.u16 r3, q0[5]
 ; CHECK-NEXT:    vmov.u16 r4, q0[4]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r4, r3
@@ -1922,7 +1899,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
 ; CHECK-NEXT:    vmov.u16 r3, q0[6]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
@@ -2085,7 +2061,6 @@
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
@@ -2344,7 +2319,6 @@
 ; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r12
-; CHECK-NEXT:    vmov q6[3], q6[1], r3, r12
 ; CHECK-NEXT:    vmov.u8 r12, q0[1]
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r12
@@ -2361,7 +2335,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[3]
 ; CHECK-NEXT:    vmov.u8 r3, q0[2]
 ; CHECK-NEXT:    vmov q7[2], q7[0], r3, r2
@@ -2388,7 +2361,6 @@
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r4, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[5]
 ; CHECK-NEXT:    vmov.u8 r4, q0[4]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
@@ -2407,7 +2379,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT:    vmov q5[3], q5[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    vmov.u8 r3, q0[6]
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
@@ -2452,7 +2423,6 @@
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    vmov q3[2], q3[0], r3, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r4
 ; CHECK-NEXT:    vmov.u8 r3, q0[9]
 ; CHECK-NEXT:    vmov.u8 r4, q0[8]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
@@ -2471,7 +2441,6 @@
 ; CHECK-NEXT:    rsb.w r2, r2, #0
 ; CHECK-NEXT:    adc.w r3, r3, r12
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r4
-; CHECK-NEXT:    vmov q3[3], q3[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q0[11]
 ; CHECK-NEXT:    vmov.u8 r4, q0[10]
 ; CHECK-NEXT:    vmov q4[2], q4[0], r4, r2
@@ -2498,7 +2467,6 @@
 ; CHECK-NEXT:    rsbs r2, r2, #0
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r2, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r4
 ; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.u8 r4, q0[12]
 ; CHECK-NEXT:    vmov q3[2], q3[0], r4, r2
@@ -2517,7 +2485,6 @@
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    vmov q2[2], q2[0], r3, r4
-; CHECK-NEXT:    vmov q2[3], q2[1], r3, r4
 ; CHECK-NEXT:    vmov.u8 r3, q0[15]
 ; CHECK-NEXT:    vmov.u8 r4, q0[14]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
@@ -2819,7 +2786,6 @@
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    csetm r3, ne
 ; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
 ; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    vmov r3, s1
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -3930,8 +3930,6 @@
 ; SSE-NEXT:    shrl $15, %ecx
 ; SSE-NEXT:    movl %eax, %edx
 ; SSE-NEXT:    shrl $8, %edx
-; SSE-NEXT:    andl $1, %edx
-; SSE-NEXT:    andl $8, %eax
 ; SSE-NEXT:    shrl $3, %eax
 ; SSE-NEXT:    xorl %edx, %eax
 ; SSE-NEXT:    andl %ecx, %eax
@@ -3946,8 +3944,6 @@
 ; AVX1OR2-NEXT:    shrl $15, %ecx
 ; AVX1OR2-NEXT:    movl %eax, %edx
 ; AVX1OR2-NEXT:    shrl $8, %edx
-; AVX1OR2-NEXT:    andl $1, %edx
-; AVX1OR2-NEXT:    andl $8, %eax
 ; AVX1OR2-NEXT:    shrl $3, %eax
 ; AVX1OR2-NEXT:    xorl %edx, %eax
 ; AVX1OR2-NEXT:    andl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/pr34137.ll b/llvm/test/CodeGen/X86/pr34137.ll
--- a/llvm/test/CodeGen/X86/pr34137.ll
+++ b/llvm/test/CodeGen/X86/pr34137.ll
@@ -13,7 +13,6 @@
 ; CHECK-NEXT:    andl %eax, %ecx
 ; CHECK-NEXT:    movl %eax, %edx
 ; CHECK-NEXT:    andl %ecx, %edx
-; CHECK-NEXT:    movzwl %dx, %edx
 ; CHECK-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testw %cx, %ax
diff --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll
--- a/llvm/test/CodeGen/X86/shift-parts.ll
+++ b/llvm/test/CodeGen/X86/shift-parts.ll
@@ -12,11 +12,12 @@
 ; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rax
 ; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rcx
 ; CHECK-NEXT:    movzbl %sil, %edx
+; CHECK-NEXT:    andl $1, %edx
 ; CHECK-NEXT:    shll $6, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testb $64, %dl
+; CHECK-NEXT:    testb %dl, %dl
 ; CHECK-NEXT:    movq %rcx, %rsi
 ; CHECK-NEXT:    cmovneq %rax, %rsi
 ; CHECK-NEXT:    orl $0, %esi
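Reviewer sketch (not part of the patch): the new ISD::AND/ISD::OR code retries SimplifyDemandedBits on Op1 with a narrowed mask once Op0's known bits are available, and the `Op1DemandedBits != DemandedBits` guard keeps the extra call free when nothing new was learned. Below is a minimal standalone C++ illustration of the two mask computations, assuming only LLVM's APInt and KnownBits headers; the helper names are hypothetical, not LLVM API.

// Illustrative helpers (hypothetical); Known0 holds the known bits of Op0.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// AND: where Op0 is known zero, the result bit is zero regardless of Op1,
// so that bit of Op1 is not demanded.
APInt narrowOp1DemandedForAnd(const APInt &DemandedBits,
                              const KnownBits &Known0) {
  return ~Known0.Zero & DemandedBits;
}

// OR: where Op0 is known one, the result bit is one regardless of Op1,
// so that bit of Op1 is not demanded either.
APInt narrowOp1DemandedForOr(const APInt &DemandedBits,
                             const KnownBits &Known0) {
  return ~Known0.One & DemandedBits;
}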