Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3870,32 +3870,37 @@ return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT)); } - // Hoist one-use addition by non-opaque constant: - // (x + C) - y -> (x - y) + C - if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && - isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { - SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); - return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1)); + // Hoist one-use addition or subtraction by non-opaque constant unless it + // increases the number of divergent nodes: + if (N0.hasOneUse() && (N0->isDivergent() || !N1->isDivergent())) { + // (x + C) - y -> (x - y) + C + if (N0.getOpcode() == ISD::ADD && + isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); + return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1)); + } + // (x - C) - y -> (x - y) - C + // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. 
+ if (N0.getOpcode() == ISD::SUB && + isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); + return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1)); + } + // (C - x) - y -> C - (x + y) + if (N0.getOpcode() == ISD::SUB && + isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1); + return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add); + } } + // y - (x + C) -> (y - x) - C if (N1.getOpcode() == ISD::ADD && N1.hasOneUse() && - isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) { + isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true) && + (N1->isDivergent() || !N0->isDivergent())) { SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1)); } - // (x - C) - y -> (x - y) - C - // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors. - if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && - isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) { - SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1); - return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1)); - } - // (C - x) - y -> C - (x + y) - if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && - isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) { - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1); - return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add); - } // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1' // rather than 'sub 0/1' (the sext should get folded). 
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -13,12 +13,12 @@ ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 -; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VARIANT0-NEXT: s_barrier -; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT0-NEXT: s_add_i32 s0, s0, -1 +; VARIANT0-NEXT: v_sub_i32_e32 v3, vcc, s0, v0 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 @@ -34,11 +34,11 @@ ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 -; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier -; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; VARIANT1-NEXT: s_add_i32 s0, s0, -1 +; VARIANT1-NEXT: v_sub_i32_e32 v3, vcc, s0, v0 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: s_waitcnt expcnt(0) @@ -54,7 +54,8 @@ ; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] -; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4 +; VARIANT2-NEXT: s_add_i32 s4, s4, -1 +; VARIANT2-NEXT: v_sub_u32_e32 v0, s4, v0 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 @@ -74,7 +75,8 @@ ; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT3-NEXT: global_store_dword v2, v0, 
s[2:3] -; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4 +; VARIANT3-NEXT: s_add_i32 s4, s4, -1 +; VARIANT3-NEXT: v_sub_u32_e32 v0, s4, v0 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VARIANT3-NEXT: v_mov_b32_e32 v3, s3