Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -202,6 +202,7 @@
   SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
+  SDValue reassociateSub(SDNode *N, SelectionDAG &DAG) const;
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
   SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11131,6 +11131,105 @@
   return DAG.getNode(Opc, SL, VT, Add1, Op2);
 }
 
+SDValue SITargetLowering::reassociateSub(SDNode *N, SelectionDAG &DAG) const {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
+    return SDValue();
+
+  // We could also handle the case where one operand is a constant, but some
+  // DAGCombiner patterns undo the transform, which sends the combiner into an
+  // infinite loop. For now, bail out if either operand is a constant.
+  if (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))
+    return SDValue();
+
+  SDLoc SL(N);
+  if (Op1->isDivergent() && Op1->hasOneUse()) {
+    unsigned Op1Opc = Op1.getOpcode();
+    if (Op1Opc != ISD::ADD && Op1Opc != ISD::SUB)
+      return SDValue();
+
+    SDValue Op2 = Op1.getOperand(1);
+    Op1 = Op1.getOperand(0);
+    if (isa<ConstantSDNode>(Op1) || isa<ConstantSDNode>(Op2))
+      return SDValue();
+
+    if (Opc == ISD::ADD && Op1Opc == ISD::SUB) {
+      // s0 + (s1 - v0) --> (s0 + s1) - v0
+      if (!Op1->isDivergent() && Op2->isDivergent())
+        return DAG.getNode(ISD::SUB, SL, VT,
+                           DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2);
+      // s0 + (v0 - s1) --> (s0 - s1) + v0
+      if (Op1->isDivergent() && !Op2->isDivergent())
+        return DAG.getNode(ISD::ADD, SL, VT,
+                           DAG.getNode(ISD::SUB, SL, VT, Op0, Op2), Op1);
+    } else if (Opc == ISD::SUB) {
+      if (Op1Opc == ISD::SUB) {
+        // s0 - (s1 - v0) --> (s0 - s1) + v0
+        if (!Op1->isDivergent() && Op2->isDivergent())
+          return DAG.getNode(ISD::ADD, SL, VT,
+                             DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
+        // s0 - (v0 - s1) --> (s0 + s1) - v0
+        if (Op1->isDivergent() && !Op2->isDivergent())
+          return DAG.getNode(ISD::SUB, SL, VT,
+                             DAG.getNode(ISD::ADD, SL, VT, Op0, Op2), Op1);
+      } else if (Op1Opc == ISD::ADD) {
+        // s0 - (s1 + v0) --> (s0 - s1) - v0
+        if (Op1->isDivergent() ^ Op2->isDivergent()) {
+          if (Op1->isDivergent())
+            std::swap(Op1, Op2);
+          return DAG.getNode(ISD::SUB, SL, VT,
+                             DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
+        }
+      }
+    }
+  }
+
+  if (Op0->isDivergent() && Op0->hasOneUse()) {
+    unsigned Op0Opc = Op0.getOpcode();
+    if (Op0Opc != ISD::ADD && Op0Opc != ISD::SUB)
+      return SDValue();
+
+    SDValue Op2 = Op0.getOperand(1);
+    Op0 = Op0.getOperand(0);
+    if (isa<ConstantSDNode>(Op1) || isa<ConstantSDNode>(Op2))
+      return SDValue();
+
+    if (!Op0->isDivergent() && Op2->isDivergent()) {
+      if (Opc == ISD::SUB) {
+        // (s1 + v0) - s0 --> (s1 - s0) + v0
+        if (Op0Opc == ISD::ADD)
+          return DAG.getNode(ISD::ADD, SL, VT,
+                             DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
+
+        // (s1 - v0) - s0 --> (s1 - s0) - v0
+        if (Op0Opc == ISD::SUB)
+          return DAG.getNode(ISD::SUB, SL, VT,
+                             DAG.getNode(ISD::SUB, SL, VT, Op0, Op1), Op2);
+      } else if (Opc == ISD::ADD && Op0Opc == ISD::SUB) {
+        // (s1 - v0) + s0 --> (s0 + s1) - v0
+        return DAG.getNode(ISD::SUB, SL, VT,
+                           DAG.getNode(ISD::ADD, SL, VT, Op0, Op1), Op2);
+      }
+    }
+
+    if (Op0->isDivergent() && !Op2->isDivergent()) {
+      // (v0 - s1) + s0 --> (s0 - s1) + v0
+      if (Opc == ISD::ADD && Op0Opc == ISD::SUB)
+        return DAG.getNode(ISD::ADD, SL, VT,
+                           DAG.getNode(ISD::SUB, SL, VT, Op1, Op2), Op0);
+    }
+  }
+
+  return SDValue();
+}
+
 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
                            SDValue N0, SDValue N1, SDValue N2,
@@ -11288,6 +11387,9 @@
     return V;
   }
 
+  if (SDValue V = reassociateSub(N, DAG))
+    return V;
+
   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
     return SDValue();
 
@@ -11330,6 +11432,9 @@
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
+  if (SDValue V = reassociateSub(N, DAG))
+    return V;
+
   if (VT != MVT::i32)
     return SDValue();
Index: llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -662,8 +662,8 @@
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    s_sub_i32 s0, s2, s3
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
@@ -675,10 +675,10 @@
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
@@ -711,8 +711,8 @@
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    s_sub_i32 s0, s2, s3
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
@@ -724,10 +724,10 @@
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
Index: llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/reassoc-sub-add.ll
@@ -0,0 +1,337 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+
+; s0 + (s1 - v0) --> (s0 + s1) - v0
+define amdgpu_kernel void @reassoc_sub_add_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_add_s0s1v0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s2, s3, s2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_add_s0s1v0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub = sub i32 %x, %tid
+  %add = add i32 %y, %sub
+  store i32 %add, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; s0 + (v0 - s1) --> (s0 - s1) + v0
+define amdgpu_kernel void @reassoc_sub_add_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_add_s0v0s1_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s3, s2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_add_s0v0s1_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub = sub i32 %tid, %x
+  %add = add i32 %y, %sub
+  store i32 %add, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; (v0 - s1) + s0 --> (s0 - s1) + v0
+define amdgpu_kernel void @reassoc_sub_add_v0s1s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_add_v0s1s0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s3, s2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_add_v0s1s0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub = sub i32 %tid, %x
+  %add = add i32 %sub, %y
+  store i32 %add, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; s0 - (s1 - v0) --> (s0 - s1) + v0
+define amdgpu_kernel void @reassoc_sub_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_sub_s0s1v0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s3, s2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_sub_s0s1v0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub1 = sub i32 %x, %tid
+  %sub2 = sub i32 %y, %sub1
+  store i32 %sub2, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; s0 - (v0 - s1) --> (s0 + s1) - v0
+define amdgpu_kernel void @reassoc_sub_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_sub_s0v0s1_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s2, s3, s2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_sub_s0v0s1_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub1 = sub i32 %tid, %x
+  %sub2 = sub i32 %y, %sub1
+  store i32 %sub2, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; s0 - (s1 + v0) --> (s0 - s1) - v0
+define amdgpu_kernel void @reassoc_add_sub_s0s1v0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_add_sub_s0s1v0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s3, s2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_add_sub_s0s1v0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %x, %tid
+  %sub = sub i32 %y, %add
+  store i32 %sub, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; s0 - (v0 + s1) --> (s0 - s1) - v0
+define amdgpu_kernel void @reassoc_add_sub_s0v0s1_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_add_sub_s0v0s1_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s3, s2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_add_sub_s0v0s1_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %tid, %x
+  %sub = sub i32 %y, %add
+  store i32 %sub, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; (s1 + v0) - s0 --> (s1 - s0) + v0
+define amdgpu_kernel void @reassoc_add_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_add_sub_s1v0s0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s2, s3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_add_sub_s1v0s0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %x, %tid
+  %sub = sub i32 %add, %y
+  store i32 %sub, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; (s1 - v0) - s0 --> (s1 - s0) - v0
+define amdgpu_kernel void @reassoc_sub_sub_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_sub_s1v0s0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_sub_i32 s2, s2, s3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_sub_s1v0s0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub1 = sub i32 %x, %tid
+  %sub2 = sub i32 %sub1, %y
+  store i32 %sub2, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; (s1 - v0) + s0 --> (s0 + s1) - v0
+define amdgpu_kernel void @reassoc_sub_add_s1v0s0_i32(ptr addrspace(1) %arg, i32 %x, i32 %y) {
+; GFX8-LABEL: reassoc_sub_add_s1v0s0_i32:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_add_i32 s2, s2, s3
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: reassoc_sub_add_s1v0s0_i32:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
bb:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %sub = sub i32 %x, %tid
+  %add = add i32 %sub, %y
+  store i32 %add, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+; If the uniform value is a constant, DAGCombiner also tries to canonicalize the
+; sub patterns, which makes this combine loop forever. For now we restrict the optimization to non-constant operands to work around that.
+define amdgpu_cs void @f(i32 inreg %arg, i32 %arg1) {
+; GFX8-LABEL: f:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    s_mov_b32 s0, 0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
+; GFX8-NEXT:    s_mov_b32 s1, s0
+; GFX8-NEXT:    s_mov_b32 s2, s0
+; GFX8-NEXT:    s_mov_b32 s3, s0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: f:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_mov_b32 s2, s0
+; GFX9-NEXT:    s_mov_b32 s3, s0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+bb:
+  %i = add i32 %arg1, %arg
+  %i2 = sub i32 0, %i
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %i2, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg)
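
For reference, reassociateSub accepts MVT::i64 as well as MVT::i32, but every test above exercises only i32. Below is a minimal sketch of a 64-bit variant of the first test; it is not part of the patch, the function name is illustrative, and the CHECK lines are omitted since they would be generated by utils/update_llc_test_checks.py:

; s0 + (s1 - v0) --> (s0 + s1) - v0, i64 version (illustrative, not in the patch)
define amdgpu_kernel void @reassoc_sub_add_s0s1v0_i64(ptr addrspace(1) %arg, i64 %x, i64 %y) {
bb:
  ; The workitem id is i32; zero-extending it keeps the value divergent, so
  ; the i64 sub/add still has one uniform and one divergent operand.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %sub = sub i64 %x, %tid.ext
  %add = add i64 %y, %sub
  store i64 %add, ptr addrspace(1) %arg, align 8
  ret void
}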