Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -442,6 +442,19 @@
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  BUFFER_STORE,
+  BUFFER_STORE_FORMAT,
+  BUFFER_ATOMIC_SWAP,
+  BUFFER_ATOMIC_ADD,
+  BUFFER_ATOMIC_SUB,
+  BUFFER_ATOMIC_SMIN,
+  BUFFER_ATOMIC_UMIN,
+  BUFFER_ATOMIC_SMAX,
+  BUFFER_ATOMIC_UMAX,
+  BUFFER_ATOMIC_AND,
+  BUFFER_ATOMIC_OR,
+  BUFFER_ATOMIC_XOR,
+  BUFFER_ATOMIC_CMPSWAP,

   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3987,6 +3987,19 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_STORE)
+  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
+  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
+  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
+  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
+  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
+  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
+  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
+  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
+  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
+  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -966,12 +966,12 @@
   >;
 }

-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;

 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
 //===----------------------------------------------------------------------===//
@@ -1013,19 +1013,19 @@
   >;
 }

-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
-defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;

 def : GCNPat<
-  (int_amdgcn_buffer_atomic_cmpswap
+  (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, 0,
       (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
       imm:$slc),
@@ -1037,7 +1037,7 @@
 >;

 def : GCNPat<
-  (int_amdgcn_buffer_atomic_cmpswap
+  (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
       (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
       imm:$slc),
@@ -1049,7 +1049,7 @@
 >;

 def : GCNPat<
-  (int_amdgcn_buffer_atomic_cmpswap
+  (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, 0,
       (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
       imm:$slc),
@@ -1061,7 +1061,7 @@
 >;

 def : GCNPat<
-  (int_amdgcn_buffer_atomic_cmpswap
+  (SIbuffer_atomic_cmpswap
       i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
       (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
       imm:$slc),
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4233,6 +4233,95 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                                    Op->getVTList(), Ops, VT, MMO);
   }
+  case Intrinsic::amdgcn_buffer_atomic_swap:
+  case Intrinsic::amdgcn_buffer_atomic_add:
+  case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_buffer_atomic_smin:
+  case Intrinsic::amdgcn_buffer_atomic_umin:
+  case Intrinsic::amdgcn_buffer_atomic_smax:
+  case Intrinsic::amdgcn_buffer_atomic_umax:
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_buffer_atomic_xor: {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Op.getOperand(5), // offset
+      Op.getOperand(6)  // slc
+    };
+    EVT VT = Op.getOperand(3).getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad |
+      MachineMemOperand::MOStore |
+      MachineMemOperand::MODereferenceable |
+      MachineMemOperand::MOVolatile,
+      VT.getStoreSize(), 4);
+    unsigned Opcode = 0;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_buffer_atomic_swap:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_add:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_sub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_smin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_umin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_smax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_umax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_and:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_or:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+      break;
+    case Intrinsic::amdgcn_buffer_atomic_xor:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+      break;
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
+
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+  }
+
+  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // src
+      Op.getOperand(3), // cmp
+      Op.getOperand(4), // rsrc
+      Op.getOperand(5), // vindex
+      Op.getOperand(6), // offset
+      Op.getOperand(7)  // slc
+    };
+    EVT VT = Op.getOperand(4).getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad |
+      MachineMemOperand::MOStore |
+      MachineMemOperand::MODereferenceable |
+      MachineMemOperand::MOVolatile,
+      VT.getStoreSize(), 4);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+
   // Basic sample.
   case Intrinsic::amdgcn_image_sample:
   case Intrinsic::amdgcn_image_sample_cl:
@@ -4460,6 +4549,30 @@
                                    Op->getVTList(), Ops, VT, MMO);
   }

+  case Intrinsic::amdgcn_buffer_store:
+  case Intrinsic::amdgcn_buffer_store_format: {
+    SDValue Ops[] = {
+      Chain,
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Op.getOperand(5), // offset
+      Op.getOperand(6), // glc
+      Op.getOperand(7)  // slc
+    };
+    EVT VT = Op.getOperand(3).getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOStore |
+      MachineMemOperand::MODereferenceable,
+      VT.getStoreSize(), 4);
+
+    unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+                      AMDGPUISD::BUFFER_STORE :
+                      AMDGPUISD::BUFFER_STORE_FORMAT;
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+  }
+
   default:
     return Op;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -93,6 +93,53 @@
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                                    [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;

+def SDTBufferStore : SDTypeProfile<0, 6,
+    [                    // vdata
+     SDTCisVT<1, v4i32>, // rsrc
+     SDTCisVT<2, i32>,   // vindex
+     SDTCisVT<3, i32>,   // offset
+     SDTCisVT<4, i1>,    // glc
+     SDTCisVT<5, i1>]>;  // slc
+
+def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
+                             [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore,
+                             [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>;
+
+class SDBufferAtomic<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 5,
+      [SDTCisVT<0, i32>,   // dst
+       SDTCisVT<1, i32>,   // vdata
+       SDTCisVT<2, v4i32>, // rsrc
+       SDTCisVT<3, i32>,   // vindex
+       SDTCisVT<4, i32>,   // offset
+       SDTCisVT<5, i1>]>,  // slc
+  [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
+def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
+def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
+def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
+def SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">;
+def SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">;
+def SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">;
+def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
+def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
+def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
+def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+
+def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
+  SDTypeProfile<1, 6,
+      [SDTCisVT<0, i32>,   // dst
+       SDTCisVT<1, i32>,   // src
+       SDTCisVT<2, i32>,   // cmp
+       SDTCisVT<3, v4i32>, // rsrc
+       SDTCisVT<4, i32>,   // vindex
+       SDTCisVT<5, i32>,   // offset
+       SDTCisVT<6, i1>]>,  // slc
+  [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+>;
+
 class SDSample<string opcode> : SDNode <opcode,
   SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>,
                        SDTCisVT<4, i32>]>
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

 ;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
 ;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
 ;CHECK: s_waitcnt vmcnt(0)
@@ -32,6 +33,7 @@
 }

 ;CHECK-LABEL: {{^}}test2:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
@@ -69,6 +71,7 @@
 ; create copies which we don't bother to track here.
 ;
 ;CHECK-LABEL: {{^}}test3:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

 ;CHECK-LABEL: {{^}}buffer_store:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
 ;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
 ;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
@@ -14,6 +15,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 main_body:
@@ -22,6 +24,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
 main_body:
@@ -30,6 +33,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 main_body:
@@ -38,6 +42,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
 main_body:
@@ -47,6 +52,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_both_reversed:
 ;CHECK: v_mov_b32_e32 v6, v4
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
 main_body:
@@ -57,6 +63,7 @@
 ; Ideally, the register allocator would avoid the wait here
 ;
 ;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
 ;CHECK: s_waitcnt expcnt(0)
 ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
@@ -71,6 +78,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
 main_body:
@@ -79,6 +87,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
 main_body:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

 ;CHECK-LABEL: {{^}}buffer_store:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
 ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
@@ -14,6 +15,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 main_body:
@@ -22,6 +24,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
 main_body:
@@ -30,6 +33,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 main_body:
@@ -38,6 +42,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
 main_body:
@@ -47,6 +52,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_both_reversed:
 ;CHECK: v_mov_b32_e32 v6, v4
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
 main_body:
@@ -57,6 +63,7 @@
 ; Ideally, the register allocator would avoid the wait here
 ;
 ;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 ;CHECK: s_waitcnt expcnt(0)
 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
@@ -71,6 +78,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
 main_body:
@@ -79,6 +87,7 @@
 }

 ;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
 main_body:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -2,6 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI

 ;CHECK-LABEL: {{^}}image_atomic_swap:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00]
 ;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -13,6 +14,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_swap_v2i32:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00]
 ;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -24,6 +26,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_swap_i32:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00]
 ;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -35,6 +38,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_cmpswap:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x40,0xf0,0x00,0x04,0x00,0x00]
 ;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -47,6 +51,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_add:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00]
 ;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -58,6 +63,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_sub:
+;CHECK-NOT: s_waitcnt
 ;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
 ;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
@@ -69,6 +75,7 @@
 }

 ;CHECK-LABEL: {{^}}image_atomic_unchanged:
+;CHECK-NOT: s_waitcnt
 ;CHECK: image_atomic_smin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x50,0xf0,0x00,0x04,0x00,0x00]
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: image_atomic_umin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x54,0xf0,0x00,0x04,0x00,0x00]
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

 ; GCN-LABEL: {{^}}image_load_v4i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -11,6 +12,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_v2i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
@@ -20,6 +22,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
@@ -29,6 +32,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_mip:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -38,6 +42,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_1:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -48,6 +53,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_f32_v2i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
@@ -57,6 +63,7 @@
 }

 ; GCN-LABEL: {{^}}image_load_v2f32_v4i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
 ; GCN: s_waitcnt vmcnt(0)
 define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -66,6 +73,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_v4i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
@@ -74,6 +82,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_v2i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
 main_body:
@@ -82,6 +91,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
 main_body:
@@ -90,6 +100,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_f32_i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
 define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 {
 main_body:
@@ -98,6 +109,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_v2f32_v4i32:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
 define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 {
 main_body:
@@ -106,6 +118,7 @@
 }

 ; GCN-LABEL: {{^}}image_store_mip:
+; GCN-NOT: s_waitcnt
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
 define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
@@ -114,6 +127,7 @@
 }

 ; GCN-LABEL: {{^}}getresinfo:
+; GCN-NOT: s_waitcnt
 ; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
 define amdgpu_ps void @getresinfo() #0 {
 main_body:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s

 ; CHECK-LABEL: {{^}}test1:
+; CHECK-NOT: s_waitcnt
 ; CHECK: image_store
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
 ; CHECK-NEXT: image_store
@@ -17,6 +18,7 @@
 ; emitted as late as possible.
 ;
 ; CHECK-LABEL: {{^}}test2:
+; CHECK-NOT: s_waitcnt
 ; CHECK: image_load
 ; CHECK-NEXT: s_waitcnt
 ; CHECK: s_waitcnt vmcnt(0){{$}}
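For reference, the intrinsics whose lowering changes above keep their existing IR-level signatures; only the SelectionDAG representation is new. A minimal sketch of IR that would exercise the new BUFFER_STORE and BUFFER_ATOMIC_ADD paths is shown below. The function name and constant operands are illustrative and not taken from the patch; the operand order mirrors the comments in the lowering code above (vdata, rsrc, vindex, offset, glc, slc for the store; vdata, rsrc, vindex, offset, slc for the atomic).

; Illustrative IR only -- not part of the patch or its tests.
define amdgpu_ps float @example_store_then_atomic(<4 x i32> inreg %rsrc, float %data, i32 %idx) {
main_body:
  ; selected through AMDGPUISD::BUFFER_STORE after this change
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %idx, i32 0, i1 0, i1 0)
  ; selected through AMDGPUISD::BUFFER_ATOMIC_ADD; returns the pre-op value
  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> %rsrc, i32 %idx, i32 0, i1 0)
  %r = bitcast i32 %old to float
  ret float %r
}

declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)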