Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -498,7 +498,11 @@ llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) [IntrReadMem], "", [SDNPMemOperand]>; + +// Assumes the buffer has stride != 0. def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; + +// Assumes the buffer has stride == 0. def int_amdgcn_buffer_load : AMDGPUBufferLoad; class AMDGPUBufferStore : Intrinsic < @@ -510,7 +514,11 @@ llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) [IntrWriteMem], "", [SDNPMemOperand]>; + +// Assumes the buffer has stride != 0. def int_amdgcn_buffer_store_format : AMDGPUBufferStore; + +// Assumes the buffer has stride == 0. def int_amdgcn_buffer_store : AMDGPUBufferStore; def int_amdgcn_tbuffer_load : Intrinsic < Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -769,6 +769,10 @@ def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; +// On GFX9, IDXEN matters for interpretation of buffer sizes. +def IDXENEliminationAlwaysAllowed : Predicate < + "Subtarget->getGeneration() < AMDGPUSubtarget::GFX9">; + // Exists to help track down where SubtargetPredicate isn't set rather // than letting tablegen crash with an unhelpful error. 
def InvalidPred : Predicate<"predicate not set on instruction or pattern">; Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -1048,21 +1048,60 @@ >; } -defm : MUBUF_LoadIntrinsicPat; -defm : MUBUF_LoadIntrinsicPat; -defm : MUBUF_LoadIntrinsicPat; +multiclass MUBUF_IndexedLoadIntrinsicPat { + let SubtargetPredicate = IDXENEliminationAlwaysAllowed in { + def : GCNPat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : GCNPat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + } + + def : GCNPat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : GCNPat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_IndexedLoadIntrinsicPat; +defm : MUBUF_IndexedLoadIntrinsicPat; +defm : MUBUF_IndexedLoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem in { - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; + defm : MUBUF_IndexedLoadIntrinsicPat; } // End HasPackedD16VMem. defm : MUBUF_LoadIntrinsicPat; @@ -1109,21 +1148,63 @@ >; } -defm : MUBUF_StoreIntrinsicPat; -defm : MUBUF_StoreIntrinsicPat; -defm : MUBUF_StoreIntrinsicPat; +multiclass MUBUF_IndexedStoreIntrinsicPat { + let SubtargetPredicate = IDXENEliminationAlwaysAllowed in { + def : GCNPat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : GCNPat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + } + + def : GCNPat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : GCNPat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_IndexedStoreIntrinsicPat; +defm : MUBUF_IndexedStoreIntrinsicPat; +defm : MUBUF_IndexedStoreIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MUBUF_StoreIntrinsicPat; - defm : 
MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - defm : MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; + defm : MUBUF_IndexedStoreIntrinsicPat; } // End HasPackedD16VMem. defm : MUBUF_StoreIntrinsicPat; Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -1,10 +1,17 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI +;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=GFX9 ;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 -;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc +;SICI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 +;SICI: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc +;SICI: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc +;VI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 +;VI: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc +;VI: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc +;GFX9: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;GFX9: buffer_load_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;GFX9: buffer_load_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc 
;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: @@ -18,7 +25,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;SICI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;VI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;GFX9: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: @@ -71,7 +80,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen +;SICI: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen +;VI: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen +;GFX9: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: @@ -80,7 +91,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 +;SICI: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 +;VI: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 +;GFX9: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: @@ -109,7 +122,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_x: -;CHECK: buffer_load_format_x v0, off, s[0:3], 0 +;SICI: buffer_load_format_x v0, off, s[0:3], 0 +;VI: buffer_load_format_x v0, off, s[0:3], 0 +;GFX9: buffer_load_format_x v0, v0, s[0:3], 0 idxen ;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { main_body: @@ -118,7 +133,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_xy: -;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 +;SICI: buffer_load_format_xy v[0:1], off, s[0:3], 0 +;VI: buffer_load_format_xy v[0:1], off, s[0:3], 0 +;GFX9: buffer_load_format_xy 
v[0:1], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { main_body: Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -1,11 +1,15 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICIVI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICIVI +;RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=GFX9 ;CHECK-LABEL: {{^}}buffer_store: ;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc +;SICIVI: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 +;SICIVI: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc +;SICIVI: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc +;GFX9: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;GFX9: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;GFX9: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) @@ -16,7 +20,8 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;SICIVI: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;GFX9: 
buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) @@ -34,7 +39,8 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +;SICIVI: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +;GFX9: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)