Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -224,6 +224,34 @@ llvm_i1_ty], // slc(imm) []>; +class AMDGPUBufferAtomic : Intrinsic < + [llvm_i32_ty], + [llvm_i32_ty, // vdata(VGPR) + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + []>; +def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; +def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< + [llvm_i32_ty], + [llvm_i32_ty, // src(VGPR) + llvm_i32_ty, // cmp(VGPR) + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + []>; + def int_amdgcn_read_workdim : AMDGPUReadPreloadRegisterIntrinsic < "__builtin_amdgcn_read_workdim">; Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -139,6 +139,17 @@ case Intrinsic::amdgcn_image_atomic_inc: case Intrinsic::amdgcn_image_atomic_dec: case Intrinsic::amdgcn_image_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_cmpswap: return true; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -2814,10 +2814,25 @@ // for VI appropriately. } +multiclass MUBUFAtomicOther_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + AtomicNoRet; + + let tfe = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; + } +} + multiclass MUBUF_Atomic { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { // No return variants let glc = 0 in { @@ -2835,6 +2850,34 @@ slc:$slc), name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 >; + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUFAtomicOther_m < + op, name#"_offen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$slc", [], 0 + >; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUFAtomicOther_m < + op, name#"_idxen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$slc", [], 0 + >; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUFAtomicOther_m < + op, name#"_bothen", (outs), + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$slc", + [], 0 + >; + } } // glc = 0 // Variant that return values @@ -2861,6 +2904,35 @@ i1:$slc), vt:$vdata_in))], 1 >; + let offen = 1, idxen = 0 in { + defm _RTN_OFFEN : MUBUFAtomicOther_m < + op, name#"_rtn_offen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#" glc"#"$slc", + [], 1 + >; + } + + let offen = 0, idxen = 1 in { + defm _RTN_IDXEN : MUBUFAtomicOther_m < + op, name#"_rtn_idxen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#" glc"#"$slc", + [], 1 + >; + } + + let offen = 1, idxen = 1 in { + defm _RTN_BOTHEN : MUBUFAtomicOther_m < + op, name#"_rtn_bothen", (outs rc:$vdata), + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#" glc"#"$slc", + [], 1 + >; + } } // glc = 1 } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1011,7 +1011,9 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global >; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic < + mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag +>; defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global >; @@ -2188,6 +2190,106 @@ >; //===----------------------------------------------------------------------===// +// buffer_atomic patterns +//===----------------------------------------------------------------------===// +multiclass BufferAtomicPatterns { + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast(opcode # _RTN_BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) + >; +} + +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + + +//===----------------------------------------------------------------------===// // S_GETREG_B32 Intrinsic Pattern. //===----------------------------------------------------------------------===// def : Pat < Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,104 @@ +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( +define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add( +define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub( +define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin( +define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin( +define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax( +define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax( +define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and( +define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or( +define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor( +define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( +define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { +main_body: + %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %r = bitcast i32 %orig to float + ret float %r +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #1 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -0,0 +1,116 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}test1: +;CHECK: buffer_atomic_swap v0, s[0:3], 0 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1000 +;CHECK: buffer_atomic_swap v0, s[0:3], [[SOFS]] glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, s[0:3], 0{{$}} +define float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) #0 { +main_body: + %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) + %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) + %ofs.5 = add i32 %voffset, 42 + %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) + %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4096, i1 0) + %unused = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test2: +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc +define float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { +main_body: + %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t7 = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t8 = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %out = bitcast i32 %t9 to float + ret float %out +} + +; Ideally, we would teach tablegen & friends that cmpswap only modifies the +; first vgpr. Since we don't do that yet, the register allocator will have to +; create copies which we don't bother to track here. +; +;CHECK-LABEL: {{^}}test3: +;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1000 +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[SOFS]] glc +define float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 { +main_body: + %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) + %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) + %ofs.5 = add i32 %voffset, 42 + %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) + %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4096, i1 0) + +; Detecting the no-return variant doesn't work right now because of how the +; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. +; Since there probably isn't a reasonable use-case of cmpswap that discards +; the return value, that seems okay. +; +; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #1 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind }