diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2764,18 +2764,18 @@
 // an inline immediate than -c.
 // TODO: Also do for 64-bit.
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
 >;
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = HasAddNoCarryInsts;
 }
 
 def : GCNPat<
-  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
   (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
   let SubtargetPredicate = NotHasAddNoCarryInsts;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-negsubinlineconst.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; FUNC-LABEL: {{^}}uniform_add_SIC:
+; GCN: S_SUB_I32 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @uniform_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %a = load i32, i32 addrspace(1)* %in
+  %result = add i32 %a, -32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}divergent_add_SIC:
+; SI: V_SUB_CO_U32_e64 killed %{{[0-9]+}}, 32
+; GFX900: V_SUB_U32_e64 killed %{{[0-9]+}}, 32
+define amdgpu_kernel void @divergent_add_SIC(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, -32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -306,43 +306,43 @@
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v9
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[4:5]
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshr_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v12
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[4:5], v9
 ; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
-; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
 ; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v12
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
@@ -355,43 +355,43 @@
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v9
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_lshr_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
-; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v8
 ; GCN-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
@@ -404,45 +404,45 @@
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
 ; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
 ; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
 ; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v16, v18, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
 ; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
-; GCN-NEXT:    v_cndmask_b32_e64 v11, v17, v19, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
 ; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
-; GCN-NEXT:    v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v12
+; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
 ; GCN-NEXT:    v_ashr_i64 v[9:10], v[6:7], v9
 ; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
 ; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
-; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v10, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
 ; GCN-NEXT:    v_ashr_i64 v[8:9], v[2:3], v8
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v8, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GCN-NEXT:    v_ashr_i64 v[8:9], v[6:7], v12
 ; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
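
Reviewer note: UniformBinFrag<Op> and DivergentBinFrag<Op> are PatFrag wrappers that gate a selection pattern on whether the SelectionDAG node is divergent; that is what lets the same add-of-negative-inline-constant fold select S_SUB_I32 on the scalar unit for uniform values and a VALU subtract for divergent ones. A minimal sketch of their shape follows; the authoritative definitions live in llvm/lib/Target/AMDGPU/AMDGPUInstructions.td, so treat the exact form here as an assumption rather than a quote from this patch:

  // Sketch only: match the binary op Op when the DAG node is divergent.
  class DivergentBinFrag<SDPatternOperator Op> : PatFrag<
    (ops node:$src0, node:$src1), (Op $src0, $src1),
    [{ return N->isDivergent(); }]>;

  // Sketch only: match the binary op Op when the DAG node is uniform.
  class UniformBinFrag<SDPatternOperator Op> : PatFrag<
    (ops node:$src0, node:$src1), (Op $src0, $src1),
    [{ return !N->isDivergent(); }]>;

The shift-i128.ll hunks carry no semantic change: the updated checks only reshuffle which condition registers (vcc vs. s[4:5]) and encodings (_e32 vs. _e64) the compares, subtracts, and cndmasks land in, which appears to be fallout from the divergent patterns now driving subtract selection.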