Index: lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -30,6 +30,7 @@
 enum DPP_CTRL {
   DPP_ROW_SR1 = 0x111,
   DPP_ROW_SR2 = 0x112,
+  DPP_ROW_SR3 = 0x113,
   DPP_ROW_SR4 = 0x114,
   DPP_ROW_SR8 = 0x118,
   DPP_WF_SR1 = 0x138,
@@ -250,20 +251,17 @@
   Value *const V = I.getOperand(ValIdx);

   // We need to know how many lanes are active within the wavefront, and we do
-  // this by getting the exec register, which tells us all the lanes that are
-  // active.
-  MDNode *const RegName =
-      llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
-  Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
-  CallInst *const Exec =
-      B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
-  setConvergent(Exec);
+  // this by doing a ballot of active lanes.
+  CallInst *const Ballot =
+      B.CreateIntrinsic(Intrinsic::amdgcn_icmp, {B.getInt32Ty()},
+                        {B.getInt32(1), B.getInt32(0), B.getInt32(33)});
+  setConvergent(Ballot);

   // We need to know how many lanes are active within the wavefront that are
   // below us. If we counted each lane linearly starting from 0, a lane is
   // below us only if its associated index was less than ours. We do this by
   // using the mbcnt intrinsic.
-  Value *const BitCast = B.CreateBitCast(Exec, VecTy);
+  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
   Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
   Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
   CallInst *const PartialMbcnt = B.CreateIntrinsic(
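
A note on the ballot's magic numbers: llvm.amdgcn.icmp with predicate 33 (ICmpInst::ICMP_NE) compares 1 against 0, which is true in every active lane, and the intrinsic packs the per-lane results into a 64-bit mask, so the call yields the same value the old read_register of exec did. A minimal scalar model of the ballot-plus-mbcnt sequence, assuming a 64-lane wave (waveBallot and laneRank are illustrative helpers, not LLVM or driver APIs):

  #include <cstdint>

  // Every active lane evaluates (1 != 0) as true, so the packed result of
  // the ballot is exactly the execution mask at the call site.
  uint64_t waveBallot(uint64_t exec) { return exec; }

  // v_mbcnt_lo_u32_b32 / v_mbcnt_hi_u32_b32 together count the set mask
  // bits strictly below this lane: the lane's rank among active lanes.
  unsigned laneRank(uint64_t mask, unsigned lane /* 0..63 */) {
    uint64_t below = mask & ((uint64_t(1) << lane) - 1);
    return unsigned(__builtin_popcountll(below));
  }
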
@@ -279,44 +277,43 @@
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
+    Value *const Identity = B.getIntN(TyBitWidth, 0);
+
     // First we need to set all inactive invocations to 0, so that they can
     // correctly contribute to the final result.
-    CallInst *const SetInactive = B.CreateIntrinsic(
-        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
+    CallInst *const SetInactive =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     setConvergent(SetInactive);
-    NewV = SetInactive;
-
-    const unsigned Iters = 6;
-    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
-                                     DPP_ROW_SR4, DPP_ROW_SR8,
-                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
-    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

-    // This loop performs an inclusive scan across the wavefront, with all lanes
+    CallInst *const FirstDPP =
+        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
+                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
+                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+    setConvergent(FirstDPP);
+    NewV = FirstDPP;
+
+    const unsigned Iters = 7;
+    const unsigned DPPCtrl[Iters] = {
+        DPP_ROW_SR1, DPP_ROW_SR2,     DPP_ROW_SR3,    DPP_ROW_SR4,
+        DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+    const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
+
+    // This loop performs an exclusive scan across the wavefront, with all lanes
     // active (by using the WWM intrinsic).
     for (unsigned Idx = 0; Idx < Iters; Idx++) {
-      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
-                                              {NewV, B.getInt32(DPPCtrl[Idx]),
-                                               B.getInt32(RowMask[Idx]),
-                                               B.getInt32(0xf), B.getFalse()});
+      Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
+      CallInst *const DPP = B.CreateIntrinsic(
+          Intrinsic::amdgcn_update_dpp, Ty,
+          {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
+           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
       setConvergent(DPP);

-      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
-      NewV = B.CreateBinOp(Op, NewV, WWM);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+      NewV = B.CreateBinOp(Op, NewV, DPP);
     }

-    // NewV has returned the inclusive scan of V, but for the lane offset we
-    // require an exclusive scan. We do this by shifting the values from the
-    // entire wavefront right by 1, and by setting the bound_ctrl (last argument
-    // to the intrinsic below) to true, we can guarantee that 0 will be shifted
-    // into the 0'th invocation.
-    CallInst *const DPP =
-        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
-                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
-                           B.getInt32(0xf), B.getTrue()});
-    setConvergent(DPP);
-    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+    NewV = B.CreateBinOp(Op, NewV, SetInactive);

     // Read the value from the last lane, which has accumulated the values of
     // each active lane in the wavefront. This will be our new value with which
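
The DPP ladder above (wave_shr:1, then row_shr:1/2/3 all sourced from the shifted value, then row_shr:4 and row_shr:8 with narrowing bank masks, then row_bcast:15 and row_bcast:31 with partial row masks) builds a wavefront-wide prefix sum in a fixed number of steps. Its net effect, stripped of the per-row mechanics, is an exclusive scan; NewV = Op(NewV, SetInactive) then recovers the inclusive scan, whose lane 63 holds the whole-wave reduction. A reference model of that net effect, assuming a 64-lane wave and integer add as the op (names are illustrative):

  #include <array>
  #include <cstdint>

  using Wave = std::array<uint32_t, 64>;

  // Net effect of the DPP sequence: lane i ends up with the combination of
  // all lanes strictly below it, with the identity (0 for add) at lane 0.
  Wave exclusiveScan(const Wave &V) {
    Wave Out{};
    uint32_t Acc = 0;
    for (unsigned Lane = 0; Lane < 64; ++Lane) {
      Out[Lane] = Acc;  // the LaneOffset each lane adds to the old value
      Acc += V[Lane];   // the inclusive value; lane 63's copy is what
    }                   // v_readlane_b32 ..., 63 extracts below
    return Out;
  }
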
@@ -344,9 +341,12 @@
     } else {
       llvm_unreachable("Unhandled atomic bit width");
     }
+
+    // Finally mark the readlanes in the WWM section.
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
   } else {
     // Get the total number of active lanes we have by using popcount.
-    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
+    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
     Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

     // Calculate the new value we will be contributing to the atomic operation
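
For the uniform-value path the same ballot feeds a plain popcount: with N active lanes all contributing the same operand, a single lane can submit the combined value directly. A sketch for an integer add, assuming a 64-lane wave (the multiply itself is the "new value" computation the trailing comment refers to, outside this hunk):

  #include <cstdint>

  // N active lanes each adding the same v contribute ctpop(ballot) * v.
  uint32_t combinedAdd(uint64_t ballot, uint32_t v) {
    return uint32_t(__builtin_popcountll(ballot)) * v;
  }
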
Index: test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -9,7 +9,7 @@
 ; Show what the atomic optimization pass will do for raw buffers.

 ; GCN-LABEL: add_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -24,7 +24,7 @@
 }

 ; GCN-LABEL: add_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -44,6 +44,14 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_add v[[value]]
@@ -69,7 +77,7 @@
 }

 ; GCN-LABEL: sub_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -84,7 +92,7 @@
 }

 ; GCN-LABEL: sub_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -104,6 +112,14 @@
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
 ; GFX7LESS-NOT: s_bcnt1_i32_b64
 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:3 row_mask:0xf bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
 ; GFX8MORE: buffer_atomic_sub v[[value]]
Index: test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7,7 +7,7 @@
 ; Show what the atomic optimization pass will do for global pointers.

 ; GCN-LABEL: add_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -22,7 +22,7 @@
 }

 ; GCN-LABEL: add_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -54,7 +54,7 @@
 }

 ; GCN-LABEL: add_i64_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -70,7 +70,7 @@
 }

 ; GCN-LABEL: add_i64_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -98,7 +98,7 @@
 }

 ; GCN-LABEL: sub_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -113,7 +113,7 @@
 }

 ; GCN-LABEL: sub_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -145,7 +145,7 @@
 }

 ; GCN-LABEL: sub_i64_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -161,7 +161,7 @@
 }

 ; GCN-LABEL: sub_i64_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
Index: test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -10,7 +10,7 @@
 ; Show what the atomic optimization pass will do for local pointers.

 ; GCN-LABEL: add_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -25,7 +25,7 @@
 }

 ; GCN-LABEL: add_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -57,7 +57,7 @@
 }

 ; GCN-LABEL: add_i64_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -73,7 +73,7 @@
 }

 ; GCN-LABEL: add_i64_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -101,7 +101,7 @@
 }

 ; GCN-LABEL: sub_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -116,7 +116,7 @@
 }

 ; GCN-LABEL: sub_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -148,7 +148,7 @@
 }

 ; GCN-LABEL: sub_i64_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -164,7 +164,7 @@
 }

 ; GCN-LABEL: sub_i64_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
Index: test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -10,7 +10,7 @@
 ; GCN-LABEL: add_i32_constant:
 ; GCN-LABEL: BB0_1:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -36,8 +36,9 @@
 ; GCN-LABEL: add_i32_varying:
 ; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
-; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], exec_lo, 0
-; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], exec_hi, v[[mbcnt_lo]]
+; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
+; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
 ; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
Index: test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -9,7 +9,7 @@
 ; Show what the atomic optimization pass will do for raw buffers.

 ; GCN-LABEL: add_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -24,7 +24,7 @@
 }

 ; GCN-LABEL: add_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -69,7 +69,7 @@
 }

 ; GCN-LABEL: sub_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -84,7 +84,7 @@
 }

 ; GCN-LABEL: sub_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
Index: test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -9,7 +9,7 @@
 ; Show what the atomic optimization pass will do for struct buffers.

 ; GCN-LABEL: add_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -24,7 +24,7 @@
 }

 ; GCN-LABEL: add_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -82,7 +82,7 @@
 }

 ; GCN-LABEL: sub_i32_constant:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
@@ -97,7 +97,7 @@
 }

 ; GCN-LABEL: sub_i32_uniform:
-; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
 ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
 ; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
 ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]