Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -410,7 +410,7 @@ } // End Predicates = [Has16BitInsts, isGFX10Plus] -class ThreeOpFrag : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be @@ -440,7 +440,9 @@ return true; }]> { let PredicateCodeUsesOperands = 1; +} +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. @@ -568,6 +570,33 @@ def : OpSelBinOpClampPat; } // End SubtargetPredicate = isGFX9Plus +// FIXME: GlobalISel in general does not handle instructions with 2 results, +// so it cannot use these patterns. +multiclass IMAD32_Pats { + def : GCNPat < + (ThreeOpFrag i32:$src0, i32:$src1, i32:$src2), + (EXTRACT_SUBREG (inst $src0, $src1, + (REG_SEQUENCE SReg_64, // Use scalar and let it be legalized + $src2, sub0, + (i32 (IMPLICIT_DEF)), sub1), + 0 /* clamp */), + sub0) + >; + // Immediate src2 in the pattern above will not fold because it would be partially + // undef. Hence define specialized pattern for this case. + // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts, + // make it SDAG only. + def : GCNPat < + (ThreeOpFragSDAG i32:$src0, i32:$src1, (i32 imm:$src2)), + (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0) + >; +} + +let SubtargetPredicate = isGFX9GFX10 in // exclude pre-GFX9 where it was slow +defm : IMAD32_Pats; +let SubtargetPredicate = isGFX11Only in +defm : IMAD32_Pats; + def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { let Src0RC64 = VRegSrc_32; let Src1RC64 = SCSrc_b32; Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -268,12 +268,11 @@ ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W64-NEXT: s_endpgm ; @@ -298,12 +297,11 @@ ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -288,12 +288,11 @@ ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; @@ -326,12 +325,11 @@ ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -878,11 +876,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -923,11 +920,10 @@ ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2] ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; @@ -967,11 +963,10 @@ ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2] ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2000,16 +1995,16 @@ ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -2048,14 +2043,14 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm @@ -2094,14 +2089,14 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5] +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -293,12 +293,11 @@ ; GFX1064-NEXT: .LBB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; @@ -327,12 +326,11 @@ ; GFX1032-NEXT: .LBB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1012,13 +1010,12 @@ ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1053,11 +1050,10 @@ ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1091,11 +1087,10 @@ ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2176,18 +2171,18 @@ ; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -2220,13 +2215,13 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 +; GFX1064-NEXT: v_mov_b32_e32 v1, v4 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2260,13 +2255,13 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 +; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -267,12 +267,11 @@ ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W64-NEXT: s_endpgm ; @@ -297,12 +296,11 @@ ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -276,12 +276,11 @@ ; GFX10W64-NEXT: .LBB1_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1] ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W64-NEXT: s_endpgm ; @@ -307,12 +306,11 @@ ; GFX10W32-NEXT: .LBB1_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1] ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/mad_64_32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -296,10 +296,12 @@ ; GFX9-LABEL: mad_i64_i32_extops_i32_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3] -; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %ext0 = sext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 @@ -363,8 +365,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 1, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 %trunc.rhs = and i64 %arg1, 4294967295 @@ -400,10 +403,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v2, v6, v3 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 8589934591 Index: llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -0,0 +1,280 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s + +define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) { +; GFX9-LABEL: mad_i32_vvv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v1, v[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vvv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v1, v[2:3] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vvv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v3, v[2:3] +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; GCN-LABEL: mad_i32_sss: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) { +; GFX9-LABEL: mad_i32_vvc: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v1, 42 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vvc: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v1, 42 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vvc: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v3, v2, 42 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, 42 + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) { +; GFX9-LABEL: mad_i32_vcv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 42, v[1:2] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vcv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, 42, v[1:2] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vcv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v0, 42, v[1:2] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, 42 + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vcc(i32 %a) { +; GFX9-LABEL: mad_i32_vcc: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 42, 43 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vcc: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, 42, 43 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 42, 43 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, 42 + %add = add i32 %mul, 43 + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) { +; GFX9-LABEL: mad_i32_vvs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v1, s[0:1] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vvs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v1, s[0:1] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vvs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v3, v2, s[0:1] +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) { +; GFX9-LABEL: mad_i32_vsv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s0, v[1:2] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, s0, v[1:2] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vsv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v0, s0, v[1:2] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) { +; GFX9-LABEL: mad_i32_svv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, v[1:2] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_svv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, v[1:2] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_svv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, s0, v0, v[1:2] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) { +; GFX9-LABEL: mad_i32_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, s0, s[2:3] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_mov_b32 s2, s1 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v2, s0, s[2:3] +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) { +; GFX9-LABEL: mad_i32_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, s1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_mov_b32 s2, s1 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v2, s[2:3] +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { +; GFX9-LABEL: mad_i32_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s0, s1, v[0:1] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, s0, s1, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %cast = bitcast i32 %add to float + ret float %cast +} + +define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) { +; GFX9-LABEL: mad_i32_vvv_multiuse: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v2 +; GFX9-NEXT: flat_store_dword v[0:1], v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: mad_i32_vvv_multiuse: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v2 +; GFX10-NEXT: flat_store_dword v[0:1], v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: mad_i32_vvv_multiuse: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v2 +; GFX11-NEXT: flat_store_b32 v[0:1], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + store i32 %mul, i32* undef + %cast = bitcast i32 %add to float + ret float %cast +} Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -74,22 +74,21 @@ ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 -; GFX9-NEXT: v_add_u32_e32 v19, v9, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v3, v13 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v15 +; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 -; GFX9-NEXT: v_sub_u32_e32 v18, v19, v18 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_add_u32_e32 v3, v18, v3 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off Index: llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -33,13 +33,13 @@ ; MUBUF-NEXT: s_cbranch_execz .LBB0_2 ; MUBUF-NEXT: ; %bb.1: ; %if.then4.i ; MUBUF-NEXT: v_add_nc_u32_e64 v0, 4, 0x4000 +; MUBUF-NEXT: s_mov_b32 s0, 0x41c64e6d ; MUBUF-NEXT: s_clause 0x1 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[36:39], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[36:39], 0 offen offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_nc_u32_e32 v0, v2, v1 -; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 -; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; MUBUF-NEXT: v_mad_u64_u32 v[0:1], s0, v0, s0, 0x3039 ; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen ; MUBUF-NEXT: .LBB0_2: ; %shader_eval_surface.exit ; MUBUF-NEXT: s_endpgm @@ -67,11 +67,11 @@ ; FLATSCR-NEXT: s_cbranch_execz .LBB0_2 ; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: s_mov_b32 s0, 0x41c64e6d ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 -; FLATSCR-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; FLATSCR-NEXT: v_mad_u64_u32 v[0:1], s0, v0, s0, 0x3039 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: .LBB0_2: ; %shader_eval_surface.exit ; FLATSCR-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/udiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv.ll +++ llvm/test/CodeGen/AMDGPU/udiv.ll @@ -2796,33 +2796,32 @@ ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0 -; GFX1030-NEXT: v_mul_lo_u32 v6, 0x186a0, v5 +; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v2, v6 +; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, 0x186a0, v5, 0 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], s4, 0x186a0, v6, v[3:4] ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0 ; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2 ; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 ; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v6, vcc_lo, v4, 2 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo +; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v5, 2 +; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo ; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4 ; GFX1030-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo -; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v4, 1 -; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v5, vcc_lo +; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v5, 1 +; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo ; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX1030-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc_lo ; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_test_udiv64_mulhi_fold: